diff --git a/.gitignore b/.gitignore index debec551d9cd7344a31efbbb709bfbb759a15d3f..801790d0a472080af607e9fbcde0284902a4ead8 100644 --- a/.gitignore +++ b/.gitignore @@ -6,10 +6,14 @@ paddle/fluid/eager/api/generated/* paddle/fluid/op_use_default_grad_maker_DEV.spec paddle/fluid/op_use_default_grad_maker_PR.spec paddle/phi/api/backward/backward_api.h +paddle/phi/api/backward/sparse_bw_api.h paddle/phi/api/include/api.h +paddle/phi/api/include/sparse_api.h paddle/phi/api/lib/api.cc paddle/phi/api/lib/dygraph_api.* paddle/phi/api/lib/backward_api.cc +paddle/phi/api/lib/sparse_api.cc +paddle/phi/api/lib/sparse_bw_api.cc paddle/phi/extension.h paddle/phi/include/* paddle/phi/infermeta/generated.* @@ -52,6 +56,7 @@ paddle/infrt/dialect/pd_ops.td paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td tools/infrt/kernels.json +tools/infrt/kernel_signature.json paddle/infrt/dialect/pd_ops_info.h .lit_test_times.txt paddle/infrt/tests/dialect/Output diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c5f711d2918bc2a2f8322cc9cd9f3a603c56ab1..6988434996bcc4745726b34278eb6007fdf8605f 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) # to develop some acl related functionality on x86 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) # Note(zhouwei): It use option above, so put here include(init) include(generic) # simplify cmake module diff --git a/README.md b/README.md index 7dc83aa695cef8ecf177dfc2c444888850342bdc..cdbf2d9f3bf9973fb6c7fe2365ea61f05ce998c1 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ English | [简体中文](./README_cn.md) Welcome to the PaddlePaddle GitHub. 
PaddlePaddle, as the only independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms. -PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 2.3 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. +PaddlePaddle originated from industrial practice, with dedication and commitment to industrialization. It has been adopted by a wide range of sectors, including manufacturing, agriculture, and enterprise services, while serving more than 4 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. 
diff --git a/README_cn.md b/README_cn.md index 6b37cfd97b35729dd293452178646db8f1194ca3..3834ee148f940326a2b1e1a8d0fd63a1028b0c96 100644 --- a/README_cn.md +++ b/README_cn.md @@ -15,7 +15,7 @@ 欢迎来到 PaddlePaddle GitHub -飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者265万,服务企业10万家,基于飞桨开源深度学习平台产生了34万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 +飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者406万,服务企业15.7万家,基于飞桨开源深度学习平台产生了47.6万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 ## 安装 diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake index 41b90345c8c5f38afa413bd2411af975c9d0b103..d3f330ba9dd0fa58b26e9ea05a7154184747daff 100644 --- a/cmake/external/cinn.cmake +++ b/cmake/external/cinn.cmake @@ -26,7 +26,7 @@ add_definitions(-w) ###################################### include(ExternalProject) set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN) -set(CINN_GIT_TAG release/v0.1) +set(CINN_GIT_TAG 56879b637e2c4db19091eedad03d7cc674e092a2) set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index a7a9e85ffd7314ac7026fccdf45fae2fa3de09d3..5c48afa2806aab10bb08317679c0a00c8f177f7b 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ -99,9 +99,10 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) - mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") - add_public_tablegen_target(${td_base}_IncGen) - add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) + set(LLVM_TARGET_DEPENDS ${LLVM_TARGET_DEPENDS} ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td) + mlir_tablegen(${td_base}.cpp.inc -gen-rewriters) + add_public_tablegen_target(MLIR${td_base}IncGen) + 
add_dependencies(mlir-headers MLIR${td_base}IncGen)
 endfunction()

 # Execute the mlir script with infrt-exec program.
diff --git a/cmake/external/onnxruntime.cmake b/cmake/external/onnxruntime.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..2162f87812d130f19262955798f28e2c2adc4bac
--- /dev/null
+++ b/cmake/external/onnxruntime.cmake
@@ -0,0 +1,88 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Downloads the prebuilt ONNX Runtime v1.10.0 release archive for the host
+# platform and exposes it as the imported target `onnxruntime`.
+# Does nothing unless WITH_ONNXRUNTIME is enabled.
+if(NOT WITH_ONNXRUNTIME)
+  return()
+endif()
+
+if(WITH_ARM)
+  message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu")
+  return()
+endif()
+
+include(ExternalProject)
+
+add_definitions(-DPADDLE_WITH_ONNXRUNTIME)
+
+# Download tree (PREFIX/SOURCE) and staging tree (INSTALL) under THIRD_PARTY_PATH.
+set(ONNXRUNTIME_PROJECT "extern_onnxruntime")
+set(ONNXRUNTIME_PREFIX_DIR ${THIRD_PARTY_PATH}/onnxruntime)
+set(ONNXRUNTIME_SOURCE_DIR ${THIRD_PARTY_PATH}/onnxruntime/src/${ONNXRUNTIME_PROJECT})
+set(ONNXRUNTIME_INSTALL_DIR ${THIRD_PARTY_PATH}/install/onnxruntime)
+set(ONNXRUNTIME_INC_DIR "${ONNXRUNTIME_INSTALL_DIR}/include" CACHE PATH "onnxruntime include directory." FORCE)
+set(ONNXRUNTIME_LIB_DIR "${ONNXRUNTIME_INSTALL_DIR}/lib" CACHE PATH "onnxruntime lib directory." FORCE)
+set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${ONNXRUNTIME_LIB_DIR}")
+
+include_directories(${ONNXRUNTIME_INC_DIR}) # For ONNXRUNTIME code to include internal headers.
+
+# Per-platform release archive and library file names.
+if(WIN32)
+  set(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-win-x64-1.10.0.zip")
+  set(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
+  set(ONNXRUNTIME_SHARED_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
+  set(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.lib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
+elseif(APPLE)
+  set(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-osx-x86_64-1.10.0.tgz")
+  set(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
+  set(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
+  set(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
+else()
+  set(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-linux-x64-1.10.0.tgz")
+  set(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
+  set(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
+  set(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
+endif()
+
+# Nothing is compiled: "installing" only copies the extracted files into the
+# staging tree. On Windows both onnxruntime.dll and onnxruntime.lib are copied.
+if(WIN32)
+  set(onnxruntime_install_steps
+      ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_SHARED_LIB} &&
+      ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.lib ${ONNXRUNTIME_LIB} &&
+      ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR})
+else()
+  set(onnxruntime_install_steps
+      ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_LIB} &&
+      ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR})
+endif()
+
+ExternalProject_Add(
+  ${ONNXRUNTIME_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  URL ${ONNXRUNTIME_URL}
+  PREFIX ${ONNXRUNTIME_PREFIX_DIR}
+  DOWNLOAD_NO_PROGRESS 1
+  CONFIGURE_COMMAND ""
+  BUILD_COMMAND ""
+  UPDATE_COMMAND ""
+  INSTALL_COMMAND ${onnxruntime_install_steps}
+  BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB}
+)
+
+add_library(onnxruntime STATIC IMPORTED GLOBAL)
+set_property(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB})
+add_dependencies(onnxruntime ${ONNXRUNTIME_PROJECT})
diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..661c3675c84b27a7ed8210fec0cfeaa2c858487c
--- /dev/null
+++ b/cmake/external/paddle2onnx.cmake
@@ -0,0 +1,96 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_ONNXRUNTIME) + return() +endif() + +if (WITH_ARM) + message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu") + return() +endif () + +INCLUDE(ExternalProject) + +SET(PADDLE2ONNX_PROJECT "extern_paddle2onnx") +SET(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx) +SET(PADDLE2ONNX_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle2onnx) +SET(PADDLE2ONNX_INC_DIR "${PADDLE2ONNX_INSTALL_DIR}/include" CACHE PATH "paddle2onnx include directory." FORCE) +SET(PADDLE2ONNX_REPOSITORY ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git) +SET(PADDLE2ONNX_TAG cpp) +SET(LIBDIR "lib") +SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}") + +INCLUDE_DIRECTORIES(${PADDLE2ONNX_INC_DIR}) # For PADDLE2ONNX code to include internal headers. +if(WIN32) + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib" CACHE FILEPATH "paddle2onnx static library." FORCE) + SET(PADDLE2ONNX_SHARED_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll" CACHE FILEPATH "paddle2onnx shared library." FORCE) +elseif(APPLE) + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib" CACHE FILEPATH "PADDLE2ONNX library." FORCE) +else() + SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so" CACHE FILEPATH "PADDLE2ONNX library." FORCE) +endif(WIN32) + + +# The protoc path is required to compile onnx. 
+# Take the directory part with get_filename_component: it exists in every
+# CMake release (list(JOIN)/list(POP_BACK) need >= 3.12/3.15) and accepts "".
+get_filename_component(PROTOC_BIN_PATH "${PROTOBUF_PROTOC_EXECUTABLE}" DIRECTORY)
+
+
+set(PADDLE2ONNX_OPTIONAL_ARGS
+    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+    -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH}
+    -DWITH_STATIC=OFF
+    -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR}
+    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+    ${EXTERNAL_OPTIONAL_ARGS}
+)
+
+if (WITH_PYTHON)
+    set(PADDLE2ONNX_OPTIONAL_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS}
+        -DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE}
+        -DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR}
+        -DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY}
+    )
+endif ()
+
+
+ExternalProject_Add(
+    ${PADDLE2ONNX_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    ${SHALLOW_CLONE}
+    GIT_REPOSITORY      ${PADDLE2ONNX_REPOSITORY}
+    GIT_TAG             ${PADDLE2ONNX_TAG}
+    DEPENDS             protobuf
+    PREFIX              ${PADDLE2ONNX_PREFIX_DIR}
+    UPDATE_COMMAND      ""
+    CMAKE_ARGS          ${PADDLE2ONNX_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR}
+                        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+    BUILD_BYPRODUCTS    ${PADDLE2ONNX_LIB}
+)
+
+ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL)  # NOTE(review): PADDLE2ONNX_LIB is a .so/.dylib off Windows; confirm STATIC is intended
+SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB})
+ADD_DEPENDENCIES(paddle2onnx ${PADDLE2ONNX_PROJECT})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index f7cb7716969f5ccaa97d1ad7964510376b86870a..58ff5f0d2b715d117018eb2ff3d5989c8beb0694 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -198,7 +198,11 @@
FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() - if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + + if(WITH_ONNXRUNTIME) + SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + SET(PROTOBUF_TAG v3.18.0) + elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) SET(PROTOBUF_TAG v3.8.0) elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) @@ -248,7 +252,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -if(WITH_ASCEND OR WITH_ASCEND_CL) +if(WITH_ONNXRUNTIME) + SET(PROTOBUF_VERSION 3.18.0) +elseif(WITH_ASCEND OR WITH_ASCEND_CL) SET(PROTOBUF_VERSION 3.8.0) elseif(WITH_IPU) SET(PROTOBUF_VERSION 3.6.1) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 45a76fdc1f1a2aab66e7f4972eecbbec03af941a..cfbe68eecbaca55c5a288aae2c985bbc33d37be2 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220307") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index da81575188ffdee91887bd76b75b4a3d6eb60ae9..ba59eae392c66354b419bbfd2688a14a26f2e388 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -651,6 +651,7 @@ function(hip_test TARGET_NAME) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH") endif() endfunction(hip_test) diff --git 
a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index c48d31f7e4f90296ecc48acb56e619aae129106e..851bd81403a85e52fbbb3c4c8bf0da1df63c8848 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -114,6 +114,24 @@ function(copy_part_of_thrid_party TARGET DST) endif() endif() + if (WITH_ONNXRUNTIME) + set(dst_dir "${DST}/third_party/install/onnxruntime") + copy(${TARGET} + SRCS ${ONNXRUNTIME_INC_DIR} ${ONNXRUNTIME_LIB_DIR} + DSTS ${dst_dir} ${dst_dir}) + + set(dst_dir "${DST}/third_party/install/paddle2onnx") + if(WIN32) + copy(${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB} ${PADDLE2ONNX_LIB} + DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib) + else() + copy(${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB} + DSTS ${dst_dir}/include ${dst_dir}/lib) + endif() + endif() + set(dst_dir "${DST}/third_party/install/gflags") copy(${TARGET} SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 7affd59de162d5956672e5abfbf9f4b287fb7a83..1291e60cfe4ce13ca9aeeb3f8bdf068af0d5832c 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -293,11 +293,11 @@ function(op_library TARGET) # Define operators that don't need pybind here. foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op") - - if ("${TARGET}" STREQUAL "${manual_pybind_op}") - set(pybind_flag 1) - endif() - endforeach() + + if ("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. @@ -478,7 +478,7 @@ function(op_library TARGET) if (${pybind_flag} EQUAL 0) # NOTE(*): activation use macro to regist the kernels, set use_op manually. 
if(${TARGET} STREQUAL "activation") - file(APPEND ${pybind_file} "USE_OP(relu);\n") + file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") elseif(${TARGET} STREQUAL "fake_quantize") diff --git a/cmake/phi.cmake b/cmake/phi.cmake index f6e15758379ada165a9dc0e31273a533b06ad2df..ebb686d8ad0f31917e64161d6f7d2ecd4644fadd 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -134,8 +134,8 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) endif() - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) - list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) + list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu) endif() endif() if (WITH_XPU) @@ -197,92 +197,88 @@ function(kernel_library TARGET) # kernel source file level # level 1: base device kernel - # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs + # - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs # level 2: device-independent kernel # - common_srcs # level 3: Kernel implemented by reusing device-independent kernel # - selected_rows_srcs + set(base_device_kernels) + set(device_independent_kernel) + set(high_level_kernels) - # Build Target according different src organization - if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND - (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) - # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. + # 1. 
Base device kernel compile + if (${cpu_srcs_len} GREATER 0) + cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_cpu) + endif() + if (${gpu_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - elseif (WITH_XPU_KP) - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) - xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) - endif() + hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If there are only specific device srcs, build target using this rule. 
- elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpu) + endif() + if (${xpu_srcs_len} GREATER 0) + cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_xpu) + endif() + if (${gpudnn_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) - hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - elseif (WITH_XPU_KP) - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) - xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() - else() - if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) - cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - endif() + hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() - # If the selected_rows_srcs depends on common_srcs, build target using this rule. - elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0) + list(APPEND base_device_kernels ${TARGET}_gpudnn) + endif() + if (${kps_srcs_len} GREATER 0) + # only when WITH_XPU_KP, the kps_srcs_len can be > 0 + xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + list(APPEND base_device_kernels ${TARGET}_kps) + endif() + + # 2. 
Device-independent kernel compile + if (${common_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) elseif (WITH_XPU_KP) - xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) else() - cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels}) endif() - # If there are only common_srcs or selected_rows_srcs, build target using below rules. - elseif (${common_srcs_len} GREATER 0) + list(APPEND device_independent_kernel ${TARGET}_common) + endif() + + # 3. 
Reusing kernel compile + if (${selected_rows_srcs_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) elseif (WITH_XPU_KP) - xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) else() - cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel}) endif() - elseif (${selected_rows_srcs_len} GREATER 0) + list(APPEND high_level_kernels ${TARGET}_sr) + endif() + + # 4. 
Unify target compile + list(LENGTH base_device_kernels base_device_kernels_len) + list(LENGTH device_independent_kernel device_independent_kernel_len) + list(LENGTH high_level_kernels high_level_kernels_len) + if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR + ${high_level_kernels_len} GREATER 0) if (WITH_GPU) - nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_ROCM) - hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) elseif (WITH_XPU_KP) - xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) else() - cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels}) endif() else() set(target_build_flag 0) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index ac3eff04d5383ecdf6c771babcaf3e6811600ac3..7df095c6c2ec04e1a694ed2458787af285c96a9a 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -250,6 +250,12 @@ IF(WITH_TESTING OR WITH_DISTRIBUTE) list(APPEND third_party_deps extern_gtest) ENDIF() +if(WITH_ONNXRUNTIME) + include(external/onnxruntime) # download, build, install onnxruntime、paddle2onnx + include(external/paddle2onnx) + list(APPEND third_party_deps extern_onnxruntime extern_paddle2onnx) +endif() + if(WITH_GPU) if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) 
include(external/cub) # download cub
diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index a5b40f8aa07d77e803f2cad36155b7de1bd03719..3fca45cc068f9916b52b3f99df2baa679d4c3546 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -1,6 +1,13 @@
 cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api)
-cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
+cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup phi phi_api)
+
+if (WITH_DISTRIBUTE)
+  cc_library(processgroup_gloo SRCS ProcessGroupGloo.cc DEPS phi phi_api eager_api gloo_wrapper)
+endif()

 if(WITH_NCCL)
   cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api)
 endif()
+if(WITH_ASCEND_CL)
+  cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api)
+endif()
diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h
new file mode 100644
index 0000000000000000000000000000000000000000..09789bd4d378630f548f931bcac00fda89ef33be
--- /dev/null
+++ b/paddle/fluid/distributed/collective/HCCLTools.h
@@ -0,0 +1,196 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <mutex>
+#include <utility>
+
+#include "boost/variant.hpp"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/npu/enforce_npu.h"
+#include "paddle/fluid/platform/device/npu/npu_info.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace distributed {
+
+// Move-only owner of a single aclrtEvent. The event is created lazily by the
+// first Record() call, on the device of the context passed to it, and is
+// destroyed by the destructor.
+class NPUEventManager {
+ public:
+  NPUEventManager() = default;
+
+  ~NPUEventManager() {
+    if (is_created_) {
+      platform::NPUDeviceGuard guard(device_index_);
+      platform::NPUEventDestroy(event_);
+    }
+  }
+
+  NPUEventManager(const NPUEventManager&) = delete;
+  NPUEventManager& operator=(const NPUEventManager&) = delete;
+
+  // Moves are implemented as swaps, so `other` ends up holding whatever this
+  // object held before (the default, not-created state for the constructor).
+  NPUEventManager(NPUEventManager&& other) {
+    std::swap(is_created_, other.is_created_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+  }
+
+  NPUEventManager& operator=(NPUEventManager&& other) {
+    std::swap(is_created_, other.is_created_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+    return *this;
+  }
+
+  bool IsCreated() const { return is_created_; }
+  // Returns the device index as stored; a bool return type would collapse
+  // every non-zero index to 1.
+  int8_t DeviceId() const { return device_index_; }
+  aclrtEvent GetRawNPUEvent() const { return event_; }
+
+  // Records the event on ctx's stream, creating it first if necessary. The
+  // enforce check fails when ctx lives on a different device than the event.
+  void Record(const paddle::platform::NPUDeviceContext& ctx) {
+    auto device_index = ctx.GetPlace().device;
+    if (!is_created_) {
+      CreateEvent(device_index);
+    }
+    PADDLE_ENFORCE_EQ(device_index, device_index_,
+                      platform::errors::PreconditionNotMet(
+                          "NPUDeviceContext's device %d does not match "
+                          "Event's device %d",
+                          device_index, device_index_));
+
+    platform::NPUDeviceGuard guard(device_index_);
+    platform::NPUEventRecord(event_, ctx.stream());
+  }
+
+  // Returns true when NPUEventQuery reports ACL_EVENT_STATUS_COMPLETE.
+  // NOTE(review): not guarded by is_created_ - confirm that querying a
+  // never-recorded (default) event handle is acceptable.
+  bool Query() const {
+    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
+    platform::NPUEventQuery(event_, &status);
+    return status == ACL_EVENT_STATUS_COMPLETE;
+  }
+
+  // Makes ctx's stream wait for the event; a no-op if it was never created.
+  void Block(const paddle::platform::NPUDeviceContext& ctx) const {
+    if (is_created_) {
+      auto device_index = ctx.GetPlace().device;
+      PADDLE_ENFORCE_EQ(device_index, device_index_,
+                        platform::errors::PreconditionNotMet(
+                            "NPUDeviceContext's device %d does not match "
+                            "Event's device %d",
+                            device_index, device_index_));
+      platform::NPUDeviceGuard guard(device_index_);
+      platform::NPUStreamWaitEvent(ctx.stream(), event_);
+    }
+  }
+
+ private:
+  bool is_created_{false};
+  aclrtEvent event_{};
+  int8_t device_index_{0};
+
+  // Creates the event on `device_index` and remembers that device.
+  void CreateEvent(int device_index) {
+    device_index_ = device_index;
+    platform::NPUDeviceGuard guard(device_index);
+    platform::NPUEventCreate(&event_);
+    is_created_ = true;
+  }
+};
+
+// Owns one HcclComm handle and destroys it in the destructor. The accessors
+// lock mutex_ before reading their field.
+class HCCLCommManager {
+ public:
+  explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {}
+
+  HCCLCommManager() : HCCLCommManager(nullptr) {}
+
+  ~HCCLCommManager() noexcept {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (hccl_comm_) {
+      platform::dynload::HcclCommDestroy(hccl_comm_);
+    }
+  }
+
+  // Initializes a communicator via HcclCommInitRootInfo and returns a manager
+  // that owns it. On failure this logs and terminates the process (exit(-1)).
+  static std::shared_ptr<HCCLCommManager> Create(int num_ranks, int rank,
+                                                 HcclRootInfo* comm_id,
+                                                 HcclComm hccl_comm) {
+    auto hccl_manager = std::make_shared<HCCLCommManager>();
+    auto ret = platform::dynload::HcclCommInitRootInfo(num_ranks, comm_id, rank,
+                                                       &hccl_comm);
+    using __NPU_STATUS_TYPE__ = decltype(ret);
+    constexpr auto __success_type__ =
+        platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess;
+    if (UNLIKELY(ret != __success_type__)) {
+      VLOG(0) << "Error: create hccl_id error.";
+      exit(-1);
+    }
+
+    hccl_manager->hccl_id_ = comm_id;
+    hccl_manager->rank_ = rank;
+    hccl_manager->hccl_comm_ = hccl_comm;
+    return hccl_manager;
+  }
+
+  HcclRootInfo* GetHcclId() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return hccl_id_;
+  }
+
+  HcclComm GetHcclComm() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return hccl_comm_;
+  }
+
+  HCCLCommManager(const HCCLCommManager&) = delete;
+  HCCLCommManager& operator=(const HCCLCommManager&) = delete;
+  HCCLCommManager& operator=(HCCLCommManager&& other) = delete;
+
+  // Transfers every field from `other`, leaving it with this object's
+  // default (null communicator) state so its destructor destroys nothing.
+  HCCLCommManager(HCCLCommManager&& other) {
+    std::unique_lock<std::mutex> lock(other.mutex_);
+    std::swap(hccl_comm_, other.hccl_comm_);
+    std::swap(hccl_id_, other.hccl_id_);
+    std::swap(rank_, other.rank_);
+  }
+
+ protected:
+  // Default member initializers matter: the move constructor swaps these
+  // fields before any other code has assigned them.
+  HcclComm hccl_comm_{nullptr};
+  HcclRootInfo* hccl_id_{nullptr};
+  int rank_{0};
+  mutable std::mutex mutex_;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h
index e4f272052024245bf7df7fc841d5e3b18978faf7..e43d0e8c183c7005f31b66c4c29dfc95361485e4 100644
--- a/paddle/fluid/distributed/collective/ProcessGroup.h
+++ b/paddle/fluid/distributed/collective/ProcessGroup.h
@@ -117,6 +117,35 @@ class ProcessGroup {
         "ProcessGroup%s does not support receive", GetBackendName()));
   }

+  virtual std::shared_ptr AllGather(
+      std::vector& in_tensors /* tensors */,     // NOLINT
+      std::vector& out_tensors /* tensors */) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support AllGather", GetBackendName()));
+  }
+
+  virtual std::shared_ptr AllToAll(
+      std::vector& in /* tensors */,     // NOLINT
+      std::vector& out /* tensors */) {  // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support AllToAll", GetBackendName()));
+  }
+
+  virtual std::shared_ptr Reduce(
+      std::vector& tensors /* tensors */,  // NOLINT
+      const ReduceOptions& opts) {         // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support Reduce", GetBackendName()));
+  }
+
+  virtual std::shared_ptr Scatter(
+      std::vector& in_tensors /* tensors */,   // NOLINT
+      std::vector& out_tensors /* tensors */,  // NOLINT
+      const ScatterOptions&) {                 // NOLINT
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "ProcessGroup%s does not support Scatter", GetBackendName()));
+  }
+
  protected:
   const int rank_;
   const int 
size_; diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc new file mode 100644 index 0000000000000000000000000000000000000000..5dc43af117825bf95407255e93e1e4600e8ddd9a --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -0,0 +1,502 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#ifdef _WIN32 +#include +#include +#include +#else +#include +#include +#include +#endif + +#include +#include +#include +#include "paddle/fluid/distributed/collective/ProcessGroupGloo.h" +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +#ifdef _WIN32 +#define GENERATE_FUNC(type, func, ...) \ + switch (type) { \ + case experimental::DataType::FLOAT32: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::FLOAT64: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::FLOAT16: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::INT32: \ + func(__VA_ARGS__); \ + break; \ + case experimental::DataType::INT64: \ + func(__VA_ARGS__); \ + break; \ + default: \ + VLOG(0) << "Error: Unknown DataType."; \ + exit(-1); \ + } + +#define HOST_NAME_MAX 256 + +#else +#define GENERATE_FUNC(type, func, args...) 
\ + switch (type) { \ + case experimental::DataType::FLOAT32: \ + func(args); \ + break; \ + case experimental::DataType::FLOAT64: \ + func(args); \ + break; \ + case experimental::DataType::FLOAT16: \ + func(args); \ + break; \ + case experimental::DataType::INT32: \ + func(args); \ + break; \ + case experimental::DataType::INT64: \ + func(args); \ + break; \ + default: \ + VLOG(0) << "Error: Unknown DataType."; \ + exit(-1); \ + } +#endif + +typedef void (*reduce_func)(void*, const void*, const void*, size_t); + +template +reduce_func get_function(const ReduceOp& r) { + switch (r) { + case ReduceOp::SUM: + return reduce_func(&::gloo::sum); + case ReduceOp::PRODUCT: + return reduce_func(&::gloo::product); + case ReduceOp::MIN: + return reduce_func(&::gloo::min); + case ReduceOp::MAX: + return reduce_func(&::gloo::max); + case ReduceOp::AVG: + VLOG(0) << "Error: Unsupported ReduceOp::AVG."; + exit(-1); + } + + VLOG(0) << "Error: Unknown ReduceOp."; + exit(-1); +} + +bool CheckTensorsInCPUPlace(const std::vector& tensors) { + return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { + return t.place() == PlaceType::kCPU; + }); +} + +template +T* get_data(const Tensor& tensor) { + auto raw_tensor = std::dynamic_pointer_cast(tensor.impl()); + return static_cast(raw_tensor->data()); +} + +template +std::vector get_multi_data(const std::vector& tensors) { + std::vector ret(tensors.size()); + for (size_t i = 0; i < tensors.size(); i++) { + ret[i] = get_data(tensors[i]); + } + return ret; +} + +template +void set_output(P& opts, const Tensor& tensor) { // NOLINT + opts.setOutput(get_data(tensor), tensor.numel()); +} + +template +void set_input(P& opts, const Tensor& tensor) { // NOLINT + opts.setInput(get_data(tensor), tensor.numel()); +} + +template +void set_outputs(P& opts, const std::vector& tensors) { // NOLINT + opts.setOutputs(get_multi_data(tensors), tensors[0].numel()); +} + +template +void set_inputs(P& opts, const std::vector& tensors) { // 
NOLINT + opts.setInputs(get_multi_data(tensors), tensors[0].numel()); +} + +template +void set_inputs_for_scatter(P& opts, // NOLINT + const std::vector& tensors, // NOLINT + int nranks) { + std::vector ret(nranks); + auto raw_tensor = + std::dynamic_pointer_cast(tensors[0].impl()); + T* raw_pointer = reinterpret_cast(raw_tensor->data()); + size_t offset = 0; + for (int i = 0; i < nranks; i++) { + ret[i] = raw_pointer + offset; + offset += tensors[0].numel() / nranks; + } + opts.setInputs(ret, tensors[0].numel() / nranks); +} + +ProcessGroupGloo::GlooTask::GlooTask(int rank, + const std::vector& inputs, + CommType comm_type) + : ProcessGroup::Task(rank, inputs, comm_type) { + PADDLE_ENFORCE_EQ(CheckTensorsInCPUPlace(inputs), true, + platform::errors::Fatal( + "Only CPU place is supported for ProcessGroupGloo.")); +} + +ProcessGroupGloo::ProcessGroupGloo(const std::shared_ptr& store, + int rank, int world_size, + const std::shared_ptr options) + : ProcessGroup(rank, world_size), _tag(0), _store(store) { + _context = std::make_shared(rank, world_size); + auto prefix_store = + ::gloo::rendezvous::PrefixStore(std::to_string(0), *_store); + _context->connectFullMesh(prefix_store, options->device); +} + +class BroadcastGlooTask : public ProcessGroupGloo::GlooTask { + public: + BroadcastGlooTask(const std::shared_ptr& context, + const std::vector& inputs, int rank, int root, + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::BROADCAST), + _context(context), + _root(root), + _inputs(inputs), + _tag(tag) {} + + void Run() override { _do_broadcast(_inputs[0]); } + + private: + std::shared_ptr _context; + const int _root; + std::vector _inputs{}; + const uint32_t _tag; + + void _do_broadcast(const Tensor& tensor) { + gloo::BroadcastOptions opts(_context); + const auto& dtype = tensor.type(); + GENERATE_FUNC(dtype, set_output, opts, tensor); + opts.setRoot(_root); + opts.setTag(_tag); + gloo::broadcast(opts); + } +}; + +std::shared_ptr 
ProcessGroupGloo::Broadcast( + std::vector& inputs, const BroadcastOptions& opts) { + auto root = opts.source_rank; + std::unique_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_unique(context, inputs, rank_, root, tag); + task->Run(); + return task; +} + +class AllreduceGlooTask : public ProcessGroupGloo::GlooTask { + public: + AllreduceGlooTask(int rank, const std::shared_ptr& context, + std::vector& inputs, ReduceOp reduce_op, // NOLINT + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLREDUCE), + _context(context), + _inputs(inputs), + _reduce_op(reduce_op), + _tag(tag) {} + + void Run() override { _do_allreduce(_inputs); } + + private: + std::shared_ptr _context; + std::vector _inputs; + const ReduceOp _reduce_op; + uint32_t _tag; + + gloo::AllreduceOptions::Func _get_function(const experimental::DataType type, + const ReduceOp op) { + gloo::AllreduceOptions::Func fn; + GENERATE_FUNC(type, _get_function_impl, fn, op); + return fn; + } + + template + void _get_function_impl(gloo::AllreduceOptions::Func& fn, // NOLINT + const ReduceOp op) { + fn = get_function(op); + } + + void _do_allreduce(std::vector& tensors) { // NOLINT + const auto& dtype = tensors[0].type(); + gloo::AllreduceOptions opts(_context); + GENERATE_FUNC(dtype, set_inputs, opts, tensors); + GENERATE_FUNC(dtype, set_outputs, opts, tensors); + opts.setReduceFunction(_get_function(dtype, _reduce_op)); + opts.setTag(_tag); + gloo::allreduce(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::AllReduce( + std::vector& inputs, const AllreduceOptions& opts) { + auto tag = next_tag(); + std::shared_ptr task; + auto context = get_context(); + task = std::make_shared(rank_, context, inputs, + opts.reduce_op, tag); + task->Run(); + return task; +} + +class BarrierGlooTask : public ProcessGroupGloo::GlooTask { + public: + BarrierGlooTask(int rank, const std::shared_ptr& context) + : ProcessGroupGloo::GlooTask(rank, std::vector{}, + 
CommType::BARRIER), + _context(context) {} + + void Run() override { _do_barrier(); } + + private: + std::shared_ptr _context; + + void _do_barrier() { + gloo::BarrierOptions opts(_context); + gloo::barrier(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Barrier( + const BarrierOptions& opts) { + std::shared_ptr task; + auto context = get_context(); + task = std::make_shared(rank_, context); + task->Run(); + return task; +} + +class AllgatherGlooTask : public ProcessGroupGloo::GlooTask { + public: + AllgatherGlooTask(int rank, const std::shared_ptr& context, + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::ALLGATHER), + _context(context), + _inputs(inputs), + _outputs(outputs), + _tag(tag) {} + + void Run() override { _do_allgather(_inputs, _outputs); } + + private: + std::shared_ptr _context; + std::vector _inputs; + std::vector _outputs; + uint32_t _tag; + + void _do_allgather(std::vector& in, // NOLINT + std::vector& out) { // NOLINT + const auto& dtype = in[0].type(); + gloo::AllgatherOptions opts(_context); + GENERATE_FUNC(dtype, set_input, opts, in[0]); + GENERATE_FUNC(dtype, set_output, opts, out[0]); + opts.setTag(_tag); + gloo::allgather(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::AllGather( + std::vector& in_tensors, std::vector& out_tensors) { + std::shared_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_shared(rank_, context, in_tensors, + out_tensors, tag); + task->Run(); + return task; +} + +class ReduceGlooTask : public ProcessGroupGloo::GlooTask { + public: + ReduceGlooTask(int rank, const std::shared_ptr& context, + std::vector& in, ReduceOp reduce_op, // NOLINT + int dst, uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, in, CommType::REDUCE), + _context(context), + _inputs(in), + _reduce_op(reduce_op), + _dst(dst), + _tag(tag) {} + + void Run() override { _do_reduce(_inputs, _dst); } + + private: + 
std::shared_ptr _context; + std::vector _inputs; + const ReduceOp _reduce_op; + int _dst; + uint32_t _tag; + + gloo::ReduceOptions::Func _get_function(const experimental::DataType type, + const ReduceOp op) { + gloo::ReduceOptions::Func fn; + GENERATE_FUNC(type, _get_function_impl, fn, op); + return fn; + } + + template + void _get_function_impl(gloo::ReduceOptions::Func& fn, // NOLINT + const ReduceOp op) { + fn = get_function(op); + } + + void _do_reduce(std::vector& tensors, int dst) { // NOLINT + const auto& dtype = tensors[0].type(); + gloo::ReduceOptions opts(_context); + GENERATE_FUNC(dtype, set_input, opts, tensors[0]); + GENERATE_FUNC(dtype, set_output, opts, tensors[0]); + opts.setReduceFunction(_get_function(dtype, _reduce_op)); + opts.setTag(_tag); + opts.setRoot(dst); + gloo::reduce(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Reduce( + std::vector& tensors, const ReduceOptions& opts) { + std::shared_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_shared(rank_, context, tensors, + opts.reduce_op, opts.root_rank, tag); + task->Run(); + return task; +} + +class ScatterGlooTask : public ProcessGroupGloo::GlooTask { + public: + ScatterGlooTask(int rank, const std::shared_ptr& context, + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + int src, int size, uint32_t tag) + : ProcessGroupGloo::GlooTask(rank, inputs, CommType::SCATTER), + _context(context), + _inputs(inputs), + _outputs(outputs), + _src(src), + _size(size), + _tag(tag) {} + + void Run() override { _do_scatter(_inputs, _outputs, _src); } + + private: + std::shared_ptr _context; + std::vector _inputs; + std::vector _outputs; + int _src; + int _size; + uint32_t _tag; + + void _do_scatter(std::vector& in, std::vector& out, // NOLINT + int src) { + const auto& dtype = in[0].type(); + gloo::ScatterOptions opts(_context); + if (rank_ == src) { + GENERATE_FUNC(dtype, set_inputs_for_scatter, opts, in, _size); + } + GENERATE_FUNC(dtype, 
set_output, opts, out[0]); + opts.setRoot(src); + opts.setTag(_tag); + gloo::scatter(opts); + } +}; + +std::shared_ptr ProcessGroupGloo::Scatter( + std::vector& in_tensors, std::vector& out_tensors, + const ScatterOptions& opts) { + std::shared_ptr task; + auto tag = next_tag(); + auto context = get_context(); + task = std::make_shared( + rank_, context, in_tensors, out_tensors, opts.root_rank, size_, tag); + task->Run(); + return task; +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDeviceForInterface(const std::string& ifname) { + ::gloo::transport::tcp::attr attr; + attr.iface = ifname; + return ::gloo::transport::tcp::CreateDevice(attr); +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDeviceForHostname(const std::string& hostname) { + ::gloo::transport::tcp::attr attr; + attr.hostname = hostname; + return ::gloo::transport::tcp::CreateDevice(attr); +} + +std::shared_ptr<::gloo::transport::Device> +ProcessGroupGloo::createDefaultDevice() { + std::array hostname{}; + auto ret = ::gethostname(hostname.data(), HOST_NAME_MAX); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::Fatal( + "Get hostname error for createDefaultDevice.")); + ::addrinfo* result; + result = tcputils::get_addr_info(hostname.data(), "", 0, AF_UNSPEC); + ::addrinfo* cur; + for (cur = result; cur != nullptr; cur = cur->ai_next) { + SocketType socket = + ::socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); + if (socket == -1) { + continue; + } + ret = ::bind(socket, cur->ai_addr, cur->ai_addrlen); +#ifdef _WIN32 + closesocket(socket); +#else + close(socket); +#endif + if (ret == -1) { + continue; + } + break; + } + freeaddrinfo(result); + if (cur != nullptr) { + return createDeviceForHostname(hostname.data()); + } + return createDeviceForHostname("127.0.0.1"); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h 
new file mode 100644 index 0000000000000000000000000000000000000000..24f156571a427128f09cd28e632212f47fa4cd47 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -0,0 +1,152 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" + +#ifdef PADDLE_WITH_GLOO +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/distributed/store/tcp_store.h" + +constexpr const char* GLOO_BACKEND_NAME = "GLOO"; + +namespace paddle { +namespace distributed { + +class ProcessGroupGloo : public ProcessGroup { + public: + class GlooTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + explicit GlooTask(int rank, const std::vector& input_tensors, + CommType comm_type); + + ~GlooTask() = default; + + virtual void Run() = 0; + bool Wait(std::chrono::milliseconds timeout) override { return true; } + bool IsCompleted() override { return true; } + void Synchronize() override {} + + protected: + friend class ProcessGroupGloo; + }; + + class GlooStore : public ::gloo::rendezvous::Store { + public: + explicit GlooStore( + const std::shared_ptr& store) + : _store(store) {} + + ~GlooStore() = default; + + std::vector get(const std::string& key) override { + VLOG(3) << "GlooStore::get"; + 
auto value = _store->get(key); + return std::vector(value.begin(), value.end()); + } + + void wait(const std::vector& keys) override { + VLOG(3) << "GlooStore::wait"; + for (auto& key : keys) { + _store->wait(key); + } + } + + void set(const std::string& key, const std::vector& value) override { + VLOG(3) << "GlooStore::set"; + std::vector tmp(value.begin(), value.end()); + _store->set(key, tmp); + } + + void wait(const std::vector& keys, + const std::chrono::milliseconds& timeout) override { + VLOG(3) << "GlooStore::wait"; + for (auto& key : keys) { + _store->wait(key); + } + // wait(keys); + } + + protected: + std::shared_ptr _store; + }; + + class GlooOptions { + public: + GlooOptions() = default; + ~GlooOptions() = default; + static std::shared_ptr create() { + return std::make_shared(); + } + std::shared_ptr<::gloo::transport::Device> device; + }; + + explicit ProcessGroupGloo(const std::shared_ptr& store, int rank, + int world_size, + std::shared_ptr options); + + ~ProcessGroupGloo() = default; + + std::shared_ptr Broadcast( + std::vector& inputs, + const BroadcastOptions& = BroadcastOptions()) override; + + std::shared_ptr AllReduce( + std::vector& inputs, + const AllreduceOptions& opts = AllreduceOptions()) override; + + std::shared_ptr Barrier( + const BarrierOptions& = BarrierOptions()) override; + + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr Reduce( + std::vector& tensors, const ReduceOptions& opts) override; + + std::shared_ptr Scatter(std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; + + std::shared_ptr<::gloo::Context> get_context() { return _context; } + uint64_t next_tag() { return _tag++; } + + const std::string GetBackendName() const override { + return GLOO_BACKEND_NAME; + } + + // Helper functions for Gloo. 
+ static std::shared_ptr<::gloo::transport::Device> createDeviceForHostname( + const std::string& hostname); + static std::shared_ptr<::gloo::transport::Device> createDeviceForInterface( + const std::string& ifname); + static std::shared_ptr<::gloo::transport::Device> createDefaultDevice(); + + protected: + uint32_t _tag; + std::shared_ptr _context; + std::shared_ptr _store; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc new file mode 100644 index 0000000000000000000000000000000000000000..2deeb7ca03003d0b6c8fa0948afa0a3394639f8b --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc @@ -0,0 +1,354 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/device/npu/hccl_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/common/place.h" + +DECLARE_bool(hccl_blocking_wait); +// DECLARE_bool(use_stream_safe_npu_allocator); + +constexpr int64_t kWaitBlockTImeout = 10; + +namespace paddle { +namespace distributed { + +static HcclReduceOp ToHCCLRedType(ReduceOp reduction) { + static const std::map red_type = { + {ReduceOp::MIN, HCCL_REDUCE_MIN}, + {ReduceOp::MAX, HCCL_REDUCE_MAX}, + {ReduceOp::SUM, HCCL_REDUCE_SUM}, + {ReduceOp::PRODUCT, HCCL_REDUCE_PROD}, + }; + auto it = red_type.find(reduction); + PADDLE_ENFORCE_EQ( + it != red_type.end(), true, + platform::errors::InvalidArgument("Invalid hccl reduction. " + "Must be Min | Max | Prod | Sum")); + return it->second; +} + +std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) { + const uint8_t* bytes = reinterpret_cast(&hcclID); + std::ostringstream oss; + for (size_t i = 0; i < sizeof(hcclID); ++i) { + oss << std::hex << static_cast(bytes[i]); + } + return oss.str(); +} + +// Get the list of devices from list of tensors +std::vector GetPlaceList(const std::vector& tensors) { + std::vector places; + places.reserve(tensors.size()); + for (auto& tensor : tensors) { + places.push_back(tensor.inner_place()); + } + return places; +} + +// Get the deviceList String from the list of devices +std::string GetKeyFromPlaces(const std::vector& places) { + std::string placeList; + for (auto& place : places) { + std::stringstream tmp; + tmp << place; + if (placeList.empty()) { + placeList += tmp.str(); + } else { + placeList += "," + tmp.str(); + } + } + return placeList; +} + +// bool CheckTensorsInNPUPlace(const std::vector& tensors) { +// return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { +// 
return t.place() == platform::DeviceType::NPU; +// }); +// } + +void SyncDefaultStream( + const std::vector& places, + std::vector& hcclEvents, // NOLINT + std::vector>& dev_ctx) { // NOLINT + for (size_t i = 0; i < places.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places[i])); + hcclEvents[i].Record(*dev_ctx[i]); + hcclEvents[i].Block(*default_ctx); + } +} + +std::shared_ptr ProcessGroupHCCL::CreateTask( + std::vector places, int rank, CommType comm_type, + const std::vector& inputs) { + return std::make_shared(places, rank, comm_type, + inputs); +} + +ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector& places, int rank, + CommType CommType, + const std::vector& inputs) + : Task(rank, inputs, CommType), places_(places) { + control_events_.resize(places.size()); + hcclComms_.resize(places.size()); +} + +ProcessGroupHCCL::HCCLTask::~HCCLTask() {} + +void ProcessGroupHCCL::HCCLTask::SetOutputs( + std::vector& outputs) { // NOLINT + outputs_ = std::make_shared>(outputs); +} + +void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() { + for (size_t i = 0; i < places_.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places_[i])); + platform::NPUStreamWaitEvent(default_ctx->stream(), + control_events_[i].GetRawNPUEvent()); + } +} + +bool ProcessGroupHCCL::HCCLTask::IsCompleted() { + for (size_t i = 0; i < places_.size(); ++i) { + if (!control_events_[i].Query()) { + return false; + } + } + + return true; +} + +// TODO(sandyhouse): Add timeout for wait, now timeout unused +bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) { + SynchronizeStreams(); + // NOTE(sandyhouse): It will block host for sync + while (!IsCompleted()) { + std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); + } + return true; +} + +// Same as Wait +void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); } + 
+ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr& store, + int rank, int size) + : ProcessGroup(rank, size), store_(store) {} + +void ProcessGroupHCCL::BroadcastUniqueHCCLID( + std::vector& hccl_ids) { // NOLINT + if (rank_ == 0) { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i); + auto hccl_id = std::vector( + reinterpret_cast(&hccl_ids[i]), + reinterpret_cast(&hccl_ids[i]) + sizeof(HcclRootInfo)); + store_->set(key, hccl_id); + } + } else { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i); + auto ret = store_->get(key); + std::memcpy(&hccl_ids[i], ret.data(), ret.size()); + } + } +} + +// create HCCLManager cache for places_key +void ProcessGroupHCCL::CreateHCCLManagerCache( + const std::string& places_key, const std::vector& places) { + PADDLE_ENFORCE_EQ(places_key.empty(), false, + platform::errors::PreconditionNotMet( + "Not able to create/get the HCCL Communicator since " + "the NPU place are not known")); + + std::vector> hccl_comms; + hccl_comms.resize(places.size()); + + // using vector just for broadcast + std::vector hccl_ids; + hccl_ids.resize(1); + auto& hccl_id = hccl_ids.front(); + + if (rank_ == 0) { + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id)); + } + BroadcastUniqueHCCLID(hccl_ids); + + VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_ + << ", place: " << places_key + << ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id); + + std::vector> dev_ctx; + dev_ctx.resize(places.size()); + + std::unique_ptr comms(new HcclComm[places.size()]); + for (size_t i = 0; i < places.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + hccl_comms[i] = HCCLCommManager::Create(GetSize(), GetRank(), &hccl_id, + comms.get() + i); + dev_ctx[i].reset(new NPUDeviceContext(places[i])); + } + + std::vector events; + events.resize(places.size()); + + // These caches will be 
useful to process sync/wait/communicate + places_to_events_.emplace(places_key, std::move(events)); + places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms)); + places_to_ctx_.emplace(places_key, std::move(dev_ctx)); +} + +template +std::shared_ptr ProcessGroupHCCL::Collective( + std::vector& inputs, std::vector& outputs, Fn fn, + CommType op_type) { + const auto places = GetPlaceList(inputs); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = places_to_hcclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, inputs); + task->SetOutputs(outputs); + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < inputs.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(inputs[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream); + } + + for (size_t i = 0; i < inputs.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +template +std::shared_ptr ProcessGroupHCCL::PointToPoint( + std::vector& tensors, Fn fn, int dst_rank, CommType op_type) { + const auto places = GetPlaceList(tensors); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) { + CreateHCCLManagerCache(key, places); + } + } + + auto& hccl_comms = 
places_to_hcclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, tensors); + + // construct uninitialize guard for device + + // if (FLAGS_use_stream_safe_npu_allocator) { + // for (size_t i = 0; i < tensors.size(); ++i) { + // platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + // auto dense_tensor = + // std::dynamic_pointer_cast(tensors[i].impl()); + // memory::RecordStream(dense_tensor->Holder(), + // places_to_ctx_[key][i]->stream()); + // } + // } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + const auto& hccl_stream = places_to_ctx_[key][i]->stream(); + fn(tensors[i], hccl_comms[i]->GetHcclComm(), hccl_stream, dst_rank); + } + + for (size_t i = 0; i < tensors.size(); ++i) { + platform::NPUDeviceGuard guard(places[i].GetDeviceId()); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +std::shared_ptr ProcessGroupHCCL::AllReduce( + std::vector& tensors, const AllreduceOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // NPUPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclAllReduce( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), + ToHCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupHCCL::Broadcast( + std::vector& tensors, const BroadcastOptions& opts) { + // PADDLE_ENFORCE_EQ( + // CheckTensorsInNPUPlace(tensors), true, + // platform::errors::InvalidArgument("All inputs should be in + // CudaPlace.")); + 
+ return Collective( + tensors, tensors, + [&](Tensor& input, Tensor& output, HcclComm comm, + const aclrtStream& stream) { + const auto root = opts.source_rank * tensors.size() + opts.source_root; + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::HcclBroadcast( + input_tensor->data(), input_tensor->numel(), + platform::ToHCCLDataType(input.type()), root, comm, stream); + }, + CommType::BROADCAST); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h new file mode 100644 index 0000000000000000000000000000000000000000..83d509be2cdd7b79faf4e2a2f510c34361b94157 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h @@ -0,0 +1,129 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/device/npu/npu_stream.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/distributed/collective/HCCLTools.h" +#include "paddle/fluid/distributed/store/store.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" + +constexpr const char* HCCL_BACKEND_NAME = "HCCL"; + +namespace paddle { +namespace distributed { + +using Place = paddle::platform::Place; +using NPUStream = platform::stream::NPUStream; +using NPUDeviceContext = paddle::platform::NPUDeviceContext; + +class ProcessGroupHCCL : public ProcessGroup { + public: + class HCCLTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + HCCLTask(const std::vector& places, int rank, CommType CommType, + const std::vector& inputs); + + bool IsCompleted(); + + void SynchronizeStreams(); + + bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + + void Synchronize(); + + void SetOutputs(std::vector& outputs); // NOLINT + + virtual ~HCCLTask(); + + std::vector control_events_; + + protected: + std::vector places_; + std::vector> hcclComms_; + std::shared_ptr> outputs_; + + private: + }; + + ProcessGroupHCCL(const std::shared_ptr& store, int rank, int size); + + const std::string GetBackendName() const override { + return std::string(HCCL_BACKEND_NAME); + } + + std::shared_ptr AllReduce( + std::vector& tensors, + const AllreduceOptions& = AllreduceOptions()) override; + + std::shared_ptr Broadcast( + std::vector& tensors, + const BroadcastOptions& = BroadcastOptions()) override; + + protected: + virtual std::shared_ptr CreateTask( + std::vector places, int rank, CommType opType, + const std::vector& inputs); + + std::shared_ptr store_; + std::shared_ptr hccl_comm_; + std::mutex mutex_; + 
std::unordered_map>> + places_to_hcclcomm_; + + std::unordered_map> + places_to_events_; + + std::unordered_map>> + places_to_ctx_; + + std::set used_place_ids_; + + private: + void BcastHCCLId(std::vector& hccl_ids, int root, // NOLINT + int server_fd); + + void BroadcastUniqueHCCLID(std::vector& hccl_ids); // NOLINT + + template + std::shared_ptr Collective( + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + Fn fn, CommType op_type); + + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, int dst_rank, CommType op_type); + + void CreateHCCLManagerCache(const std::string& places_key, + const std::vector& places); +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 5d96e730aa4b1aeae3fc242ca43f63d909325a4e..7f21bcee87ab705097d3c2beaf799e5f2d93b833 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -88,8 +88,8 @@ void SyncDefaultStream( for (size_t i = 0; i < places.size(); ++i) { auto* default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(places[i])); - ncclEvents[i].Record(*dev_ctx[i]); - ncclEvents[i].Block(*default_ctx); + ncclEvents[i].Record(*default_ctx); + ncclEvents[i].Block(*dev_ctx[i]); } } @@ -156,36 +156,27 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { // Same as Wait void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); } -ProcessGroupNCCL::ProcessGroupNCCL(const ProcessGroupStrategy& strategy, +ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size) - : ProcessGroup(rank, size), strategy_(strategy) {} - -void ProcessGroupNCCL::BcastNCCLId( - std::vector& nccl_ids, // NOLINT - int root, int server_fd) { - if (strategy_.local_rank_ == root) { - std::vector other_trainers; - for (auto& ep : 
strategy_.trainer_endpoints_) { - if (ep != strategy_.current_endpoint_) { - other_trainers.push_back(ep); - } - } - platform::SendBroadCastCommID(other_trainers, &nccl_ids); - } else { - platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, - &nccl_ids); - } -} + : ProcessGroup(rank, size), store_(store) {} void ProcessGroupNCCL::BroadcastUniqueNCCLID( std::vector& nccl_ids) { // NOLINT - - int server_fd = -1; - if (rank_ != 0) { - server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) - .socket(); + if (rank_ == 0) { + for (size_t i = 0; i < nccl_ids.size(); i++) { + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto nccl_id = std::vector( + reinterpret_cast(&nccl_ids[i]), + reinterpret_cast(&nccl_ids[i]) + NCCL_UNIQUE_ID_BYTES); + store_->set(key, nccl_id); + } + } else { + for (size_t i = 0; i < nccl_ids.size(); i++) { + auto key = "ProcessGroupNCCL/nccl_ids/" + std::to_string(i); + auto ret = store_->get(key); + std::memcpy(&nccl_ids[i], ret.data(), ret.size()); + } } - BcastNCCLId(nccl_ids, 0, server_fd); } // create NCCLManager cache for places_key @@ -213,8 +204,8 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( } BroadcastUniqueNCCLID(nccl_ids); - VLOG(3) << "init nccl rank: " << strategy_.local_rank_ - << ", nranks: " << strategy_.nranks_ << ", place: " << places_key + VLOG(3) << "init nccl rank: " << rank_ << ", nranks: " << size_ + << ", place: " << places_key << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id); std::vector> dev_ctx; @@ -473,5 +464,148 @@ std::shared_ptr ProcessGroupNCCL::Recv( return task; } +std::shared_ptr ProcessGroupNCCL::AllGather( + std::vector& in_tensors, std::vector& out_tensors) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); + 
return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclAllGather( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), comm, stream); + }, + CommType::ALLGATHER); +} + +void* GetPointerByOffset(void* raw_pointer, size_t offset, + experimental::DataType type) { + if (type == experimental::DataType::FLOAT32) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::FLOAT64) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::INT32) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::INT64) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::FLOAT16) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "This datatype in nccl is not supported.")); + } +} + +std::shared_ptr ProcessGroupNCCL::AllToAll( + std::vector& in_tensors, std::vector& out_tensors) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + size_t offset = 0; +
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + GetPointerByOffset(output_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + offset += input_tensor->numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + }, + CommType::ALLTOALL); +} + +std::shared_ptr ProcessGroupNCCL::Reduce( + std::vector& tensors, const ReduceOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( + input_tensor->data(), output_tensor->data(), input.numel(), + platform::ToNCCLDataType(input.type()), + ToNCCLRedType(opts.reduce_op), opts.root_rank, comm, stream)); + }, + CommType::REDUCE); +} + +std::shared_ptr ProcessGroupNCCL::Scatter( + std::vector& in_tensors, std::vector& out_tensors, + const ScatterOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); + return Collective( + in_tensors, out_tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { +
auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + size_t offset = 0; + if (rank_ == opts.root_rank) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input_tensor->data(), offset, input.type()), + input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), i, comm, stream)); + offset += input_tensor->numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output_tensor->data(), input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output_tensor->data(), input_tensor->numel() / size_, + platform::ToNCCLDataType(input.type()), opts.root_rank, comm, + stream)); + } + }, + CommType::SCATTER); +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index cfeb6467f0dbf21f116b1880f8b64a55bb2314a1..aa2a2b8fa2088cd30729ba5e6184ef7a9c507bf3 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/distributed/store/store.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" @@ -75,7 +76,7 @@ class ProcessGroupNCCL : public ProcessGroup { private: }; - ProcessGroupNCCL(const ProcessGroupStrategy& strategy, int rank, int size); + ProcessGroupNCCL(const std::shared_ptr& store, int rank, int size); const std::string 
GetBackendName() const override { return std::string(NCCL_BACKEND_NAME); @@ -98,13 +99,27 @@ class ProcessGroupNCCL : public ProcessGroup { std::shared_ptr Recv(std::vector& tensors, int src_rank) override; + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr AllToAll( + std::vector& in, std::vector& out) override; + + std::shared_ptr Reduce( + std::vector& tensors, const ReduceOptions& opts) override; + + std::shared_ptr Scatter(std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions&) override; + protected: virtual std::shared_ptr CreateTask( std::vector places, int rank, CommType opType, const std::vector& inputs); protected: - ProcessGroupStrategy strategy_; + std::shared_ptr store_; std::shared_ptr nccl_comm_; std::mutex mutex_; std::unordered_map>> diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h index 699222ac452dbcc2f0b1b41c70c6036dc915a427..973f7c643542757c0bce68f8ccdefeadc97f15d4 100644 --- a/paddle/fluid/distributed/collective/Types.h +++ b/paddle/fluid/distributed/collective/Types.h @@ -36,5 +36,14 @@ struct BarrierOptions { std::vector place_ids; }; +struct ReduceOptions { + ReduceOp reduce_op = ReduceOp::SUM; + int root_rank = 0; +}; + +struct ScatterOptions { + int root_rank = 0; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 59f3ea3b0a7d85651e7780b4b11875f19b70931e..5533f3f4cbf4b136c52b35cb74afefb86cbe73d7 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -13,7 +13,6 @@ // limitations under the License. 
#include "paddle/fluid/distributed/collective/reducer.h" -#include "paddle/phi/common/data_type.h" namespace paddle { namespace distributed { @@ -127,5 +126,430 @@ std::vector> Eager_AssignGroupBySize( return res; } +template +static void ConcatTensorsForAllReduce( + const DeviceContext &context, + const std::vector &dense_tensors_, + Tensor *p_dense_contents) { + operators::math::ConcatFunctor concat_functor_; + concat_functor_( + context, dense_tensors_, 0, + std::dynamic_pointer_cast(p_dense_contents->impl()) + .get()); +} + +template +static void SplitTensorsForAllReduce( + const DeviceContext &context, Tensor *p_dense_contents, + std::vector *p_dense_tensors) { + auto *in = + std::dynamic_pointer_cast(p_dense_contents->impl()) + .get(); + std::vector outs; + std::vector shape_refer; + + outs.reserve(p_dense_tensors->size()); + shape_refer.reserve(p_dense_tensors->size()); + + for (auto &tensor : *p_dense_tensors) { + outs.emplace_back(&tensor); + shape_refer.emplace_back(&tensor); + } + + operators::math::SplitFunctor split_functor_; + split_functor_(context, *in, shape_refer, 0, &outs); +} + +// context is used to select the stream for concat +template +static void ConcatTensorsWithType( + const DeviceContext &context, + const std::vector &dense_tensors_, + Tensor *p_dense_contents, phi::DataType type) { + switch (type) { + case phi::DataType::FLOAT16: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + case phi::DataType::FLOAT32: + ConcatTensorsForAllReduce(context, dense_tensors_, + p_dense_contents); + break; + case phi::DataType::FLOAT64: + ConcatTensorsForAllReduce(context, dense_tensors_, + p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + type)); + } +} + +// context is used to select the stream for split +template +static void SplitTensorsWithType(const DeviceContext &context, + Tensor 
*p_dense_contents, + std::vector *p_dense_tensors, + phi::DataType type) { + switch (type) { + case phi::DataType::FLOAT16: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + case phi::DataType::FLOAT32: + SplitTensorsForAllReduce(context, p_dense_contents, + p_dense_tensors); + break; + case phi::DataType::FLOAT64: + SplitTensorsForAllReduce(context, p_dense_contents, + p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + type)); + } +} + +void EagerGroup::ConcatTensors(const platform::Place &place) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_, + dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat grad tensors since it's not compiled with NCCL," + "Please recompile or reinstall Paddle with NCCL support.")); +#endif + } else if (platform::is_cpu_place(place)) { + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + ConcatTensorsWithType(*default_ctx, dense_tensors_, &dense_contents_, + dtype_); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Concat grad tensor not supported on place (%s)", place)); + } +} + +void EagerGroup::SplitTensors(const platform::Place &place) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_, + dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split grad tensor since it's not compiled with NCCL," + "Please recompile or reinstall Paddle with NCCL 
support.")); +#endif + } else if (platform::is_cpu_place(place)) { + auto *default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + SplitTensorsWithType(*default_ctx, &dense_contents_, &dense_tensors_, + dtype_); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Split grad tensor not supported on place (%s)", place)); + } +} + +EagerReducer::EagerReducer( + const std::vector tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, bool find_unused_parameters) + : tensors_(tensors), + group_indices_(group_indices), + is_sparse_gradient_(is_sparse_gradient), + process_group_(process_group), + group_size_limits_(group_size_limits), + find_unused_vars_each_step_(find_unused_parameters) { + VLOG(3) << "Start construct the Reducer ..."; + + nranks_ = process_group_->GetSize(); + + // initialize groups + InitializeGroups(group_indices); + + for (size_t global_var_index = 0; global_var_index < tensors_.size(); + ++global_var_index) { + auto tensor = tensors_[global_var_index]; + auto reduce_hook = [=](void) -> void { + this->AddDistHook(global_var_index); + }; + + const auto &grad_node = GetGradNodeFromTensor(&tensor); + + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation")); + const auto &accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + accumulation_grad_node->RegisterReduceHook( + std::make_shared(reduce_hook)); + } + + vars_marked_ready_.resize(tensors_.size(), false); + local_used_vars_.resize(tensors_.size(), 0); +} + +std::shared_ptr EagerReducer::GetGradNodeFromTensor( + Tensor *tensor) { + auto *autograd_meta = tensor->get_autograd_meta(); + const auto &grad_node = + static_cast(autograd_meta)->GetMutableGradNode(); + return grad_node; +} + +void 
EagerReducer::InitializeGroups( + const std::vector> &group_indices) { + VLOG(3) << "Start initialize groups .."; + + // clear the group + groups_.clear(); + groups_.reserve(group_indices.size()); + + variable_locators_.clear(); + variable_locators_.resize(tensors_.size()); + + auto group_nums = group_indices.size(); + for (size_t group_index = 0; group_index < group_nums; ++group_index) { + const auto &tensor_indices_ = group_indices[group_index]; + PADDLE_ENFORCE_GT( + tensor_indices_.size(), 0, + platform::errors::PreconditionNotMet( + "The number of group[%d]'s elements is 0.", group_index)); + + EagerGroup group; + + // It's just for check the sparse or dense + auto first_var = tensors_[tensor_indices_.front()]; + if (tensor_indices_.size() == 1 && + is_sparse_gradient_[tensor_indices_.front()]) { + // process the sparse gradient. one sparse, one group + group.dtype_ = first_var.dtype(); + } else { + // process the dense gradient. + InitializeDenseGroups(tensor_indices_, &group); + experimental::Backend backend; + switch (inner_place_.GetType()) { + case phi::AllocationType::GPU: + backend = experimental::Backend::GPU; + break; + case phi::AllocationType::CPU: + backend = experimental::Backend::CPU; + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Place type (%s) is not supported. 
", inner_place_)); + break; + } + group.dense_contents_ = paddle::experimental::empty( + ScalarArray({group.all_length_}), group.dtype_, backend); + } + + // map tensors to this group by VariableLocator + size_t inside_group_index = 0; + for (const auto var_index : tensor_indices_) { + TensorLocator tensor_locator; + tensor_locator.group_index = group_index; + tensor_locator.inside_group_index = inside_group_index++; + variable_locators_[var_index] = tensor_locator; + } + group.tensor_indices_ = std::move(tensor_indices_); + groups_.emplace_back(std::move(group)); + + VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); + } +} + +void EagerReducer::InitializeDenseGroups( + const std::vector &tensor_indices_, EagerGroup *p_group) { + VLOG(3) << "InitializeDenseGroups."; + int64_t all_length = 0; + for (size_t index = 0; index < tensor_indices_.size(); ++index) { + auto tensor_index = tensor_indices_[index]; + auto &tensor = tensors_[tensor_index]; + auto &tensor_name = tensor.name(); + + PADDLE_ENFORCE_EQ(tensor.is_initialized(), true, + platform::errors::PreconditionNotMet( + "Tensor %s is not initialized.", tensor_name)); + const auto size = tensor.numel(); + PADDLE_ENFORCE_GT( + size, 0, platform::errors::PreconditionNotMet( + "The number of tensor %s's elements is 0.", tensor_name)); + all_length += size; + + p_group->length_.push_back(size); + + // for concat operator + p_group->origin_shapes_.push_back(ScalarArray(tensor.shape())); + p_group->dense_tensors_.push_back(phi::DenseTensor()); + + const auto &dtype = tensor.dtype(); + const auto &place = tensor.place(); + const auto &inner_place = tensor.impl()->place(); + if (index > 0) { + PADDLE_ENFORCE_EQ(dtype, p_group->dtype_, + platform::errors::PreconditionNotMet( + "Tensor %s has unexpected dtype.", tensor_name)); + PADDLE_ENFORCE_EQ(place, place_, + platform::errors::PreconditionNotMet( + "Tensor %s has different place. 
Expected place is " + "%s, but actual place is %s", + tensor_name, inner_place_, inner_place)); + } else { + p_group->dtype_ = dtype; + place_ = place; + inner_place_ = inner_place; + } + } + p_group->all_length_ = all_length; +} + +void EagerReducer::PrepareForBackward(const std::vector &outputs) { + VLOG(3) << "after forward, then reset count for backward."; + grad_need_hooks_ = true; + next_group_ = 0; + std::for_each(groups_.begin(), groups_.end(), [](EagerGroup &group) { + group.pending_ = group.tensor_indices_.size(); + }); + + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(tensors_.size(), false); +} + +void EagerReducer::AddDistHook(size_t var_index) { + PADDLE_ENFORCE_LT(var_index, variable_locators_.size(), + platform::errors::OutOfRange( + "Out of bounds variable index. it must be less " + "than %d, but it is %d", + variable_locators_.size(), var_index)); + + // gradient synchronization is not required when grad_need_hooks_ is false.
+ if (!grad_need_hooks_) { + return; + } + + auto &tensor = tensors_[var_index]; + const auto &grad_node = GetGradNodeFromTensor(&tensor); + + VLOG(3) << "Var[" << var_index << "] [" << (*grad_node).name() + << "] arrived and triggered disthook"; + + local_used_vars_[var_index] = 1; + + MarkVarReady(var_index, true); +} + +void EagerReducer::MarkVarReady(const size_t var_index, + const bool is_used_var) { + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto inside_group_index = var_locator.inside_group_index; + + auto &group = groups_[group_index]; + auto &group_tensor = group.dense_tensors_[inside_group_index]; + auto *autograd_meta = tensors_[var_index].get_autograd_meta(); + auto &grad_tensor = static_cast(autograd_meta)->Grad(); + + group_tensor + .ShareDataWith( + *(std::dynamic_pointer_cast(grad_tensor.impl()))) + .Resize({grad_tensor.numel()}); + + vars_marked_ready_[var_index] = true; + + if (--group.pending_ == 0) { + // can start allreduce + MarkGroupReady(group_index); + } +} + +void EagerReducer::MarkGroupReady(size_t group_index) { + VLOG(3) << "Group[" << group_index << "] is ready"; + + PADDLE_ENFORCE_GE( + group_index, next_group_, + platform::errors::PreconditionNotMet( + "The index of the incoming group must be greater " + "than or equal to the previously synchronized group index, " + "expect it to greater than or equal to %d, but got %d.", + next_group_, group_index)); + + if (group_index > next_group_) { + VLOG(3) << "It will adjust the order of group in next batch automatically"; + return; + } + + for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0; + ++next_group_) { + UNUSED auto &group = groups_[next_group_]; + FusedAllReduceSchedule(&group, next_group_); + } +} + +void EagerReducer::FusedAllReduceSchedule(EagerGroup *group, + const int curr_group_index) { + // The overall timeline: concat > div_nranks > allreduce > split + 
distributed::AllreduceOptions opts; + opts.reduce_op = ReduceOp::SUM; + + VLOG(3) << "group [" << curr_group_index << "] start fused_allreduce."; + + // concat tensors + group->ConcatTensors(inner_place_); + + // div nranks + double scaling = 1.0 / nranks_; + paddle::experimental::scale_(group->dense_contents_, scaling, 0.0, false); + + // all_reduce + std::vector reduce_tensors = {group->dense_contents_}; + tasks_.push_back(process_group_->AllReduce(reduce_tensors, opts)); + + if (tasks_.size() == groups_.size()) { + // drain every pending task; an index loop bounded by the shrinking size() would stop halfway + while (!tasks_.empty()) { + tasks_.back()->Synchronize(); + tasks_.pop_back(); + } + for (size_t index = 0; index < groups_.size(); index++) { + auto &group = groups_[index]; + group.SplitTensors(inner_place_); + } + } +} + +std::ostream &operator<<(std::ostream &out, const EagerGroup &group) { + const auto &tensors_ = group.tensor_indices_; + out << "numel: " << group.all_length_ << " ;var number: " << tensors_.size() + << "\n"; + auto begin = tensors_.begin(); + auto end = tensors_.end(); + out << "["; + for (int i = 0; begin != end && i < 100; ++i, ++begin) { + if (i > 0) out << ' '; + out << *begin; + } + if (begin != end) { + out << " ..."; + } + out << "]\n"; + return out; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h index f8c75385ef8bd6891df8eda6faa93c73091c37f5..ac6f3fbe5956cd47d4385343509d41afec0b69a4 100644 --- a/paddle/fluid/distributed/collective/reducer.h +++ b/paddle/fluid/distributed/collective/reducer.h @@ -17,16 +17,109 @@ #include #include #include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/utils.h" +#include
"paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/ext_compat_utils.h" +#include "paddle/phi/common/data_type.h" namespace paddle { namespace distributed { using Tensor = paddle::experimental::Tensor; +using Scalar = paddle::experimental::ScalarBase; +using ScalarArray = + paddle::experimental::ScalarArrayBase; std::vector> Eager_AssignGroupBySize( - const std::vector, const std::vector& is_sparse_gradient, - const std::vector& group_size_limits, - const std::vector& tensor_indices = {}); + const std::vector, const std::vector &is_sparse_gradient, + const std::vector &group_size_limits, + const std::vector &tensor_indices = {}); + +class EagerGroup { + public: + Tensor dense_contents_; + + // for concat kernel + std::vector dense_tensors_; + std::vector length_; + int64_t all_length_{0}; + std::vector origin_shapes_; + + // Global indices of participating tensors in the group + std::vector tensor_indices_; + + // Number of params that haven't been ready. When it is 0, it means + // the group is ready. 
+ size_t pending_ = -1; + + // external message of group + phi::DataType dtype_; + + // context is used to select the stream for concat + void ConcatTensors(const platform::Place &); + + // context is used to select the stream for split + void SplitTensors(const platform::Place &); + + friend std::ostream &operator<<(std::ostream &, const EagerGroup &); +}; + +struct TensorLocator { + // record the index in groups_ + size_t group_index; + size_t inside_group_index; +}; + +class EagerReducer { + public: + explicit EagerReducer( + const std::vector tensors, + const std::vector> &group_indices, + const std::vector &is_sparse_gradient, + std::shared_ptr process_group, + const std::vector &group_size_limits, + bool find_unused_parameters); + + virtual ~EagerReducer() {} + + std::shared_ptr GetGradNodeFromTensor(Tensor *tensor); + + void InitializeGroups(const std::vector> &group_indices); + void InitializeDenseGroups(const std::vector &tensor_indices_, + EagerGroup *p_group); + void PrepareForBackward(const std::vector &outputs); + void AddDistHook(size_t var_index); + void MarkVarReady(const size_t var_index, const bool is_used_var); + void MarkGroupReady(const size_t group_index); + void FusedAllReduceSchedule(EagerGroup *group, const int curr_group_index); + + private: + std::vector tensors_; + std::vector> group_indices_; + std::vector is_sparse_gradient_; + std::shared_ptr process_group_; + std::vector group_size_limits_; + bool find_unused_vars_each_step_; + + std::vector groups_; + std::vector variable_locators_; + PlaceType place_; + platform::Place inner_place_; + size_t next_group_ = 0; + int64_t nranks_ = -1; + std::vector> tasks_; + + bool grad_need_hooks_{false}; + + std::vector vars_marked_ready_; + std::vector local_used_vars_; +}; } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc 
index 18920d06f38543cc3f7aeb045e7c3058143e006e..ba039385a74ba45aa1f33ba38138d8e5213f2e00 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -24,10 +24,14 @@ limitations under the License. */ #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(fill_constant); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/ps/table/depends/initializers.h b/paddle/fluid/distributed/ps/table/depends/initializers.h index f46e659a88babb07918d02f1e05859829895f2bf..5ac0c08f97d76f6bc1cb77f1f6cd0da77be2385f 100644 --- a/paddle/fluid/distributed/ps/table/depends/initializers.h +++ b/paddle/fluid/distributed/ps/table/depends/initializers.h @@ -23,7 +23,6 @@ #include "gflags/gflags.h" #include "paddle/fluid/framework/generator.h" - #include "paddle/fluid/operators/truncated_gaussian_random_op.h" namespace paddle { @@ -118,9 +117,13 @@ class TruncatedGaussianInitializer : public Initializer { seed_ = static_cast(std::stoi(attrs[1])); mean_ = std::stof(attrs[2]); std_ = std::stof(attrs[3]); - - std::uniform_real_distribution dist_( - std::numeric_limits::min(), 1.0); + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + float a_normal_cdf = normal_cdf((-2.0 - mean_) / std_); + float b_normal_cdf = normal_cdf((2.0 - mean_) / std_); + std::uniform_real_distribution dist_(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); random_engine_ = framework::GetCPURandomEngine(seed_); } diff --git a/paddle/fluid/distributed/store/store.h b/paddle/fluid/distributed/store/store.h index 
2673314d222d2b32e42c42a3a94df71a1887914a..7b4ae7e70ff6f033e038f1c5214f46e0876257d2 100644 --- a/paddle/fluid/distributed/store/store.h +++ b/paddle/fluid/distributed/store/store.h @@ -25,13 +25,26 @@ namespace distributed { class Store { public: - Store() = delete; + Store() : _timeout(tcputils::kNoTimeout) {} explicit Store(const std::chrono::seconds& timeout) : _timeout(timeout) {} virtual ~Store() = default; - virtual int64_t add(const std::string& key, int64_t value) = 0; - virtual std::vector get(const std::string& key) = 0; - virtual void wait(const std::string& key) = 0; + virtual int64_t add(const std::string& key, int64_t value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual std::vector get(const std::string& key) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual void wait(const std::string& key) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } + virtual void set(const std::string& key, const std::vector& value) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Implement the add method in the subclass.")); + } virtual const std::chrono::seconds& timeout() const { return _timeout; } diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc index de85ac0d910e93257a308052ca1fcf193680a183..b0d5add49565ffb19762778ddd44a388b140c0ee 100644 --- a/paddle/fluid/distributed/store/tcp_store.cc +++ b/paddle/fluid/distributed/store/tcp_store.cc @@ -27,11 +27,13 @@ namespace detail { constexpr int INFTIME = -1; -std::unique_ptr MasterDaemon::start(SocketType socket) { - return std::make_unique(socket); +std::unique_ptr MasterDaemon::start(SocketType socket, + int nranks) { + return std::make_unique(socket, nranks); } -MasterDaemon::MasterDaemon(SocketType socket) : _listen_socket(socket) { +MasterDaemon::MasterDaemon(SocketType socket, 
int nranks) + : _listen_socket(socket), _nranks(nranks) { _background_thread = std::thread{&MasterDaemon::run, this}; } @@ -64,27 +66,35 @@ void MasterDaemon::_do_add(SocketType socket) { tcputils::send_value(socket, new_value); } +void MasterDaemon::_do_set(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_set"; + std::string key = tcputils::receive_string(socket); + auto value = tcputils::receive_vector(socket); + _store[key] = value; +} + void MasterDaemon::_do_get(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_get"; std::string key = tcputils::receive_string(socket); auto iter = _store.find(key); PADDLE_ENFORCE_NE( iter, _store.end(), platform::errors::InvalidArgument("Key %s not found in TCPStore.", key)); std::vector value = iter->second; - VLOG(3) << "TCPStore: value (" - << std::stoll(std::string(reinterpret_cast(value.data()), - value.size())) - << ") for key (" << key << ")."; tcputils::send_vector(socket, value); } void MasterDaemon::_do_stop(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_stop"; ReplyType value = ReplyType::STOP_WAIT; - _stop = true; tcputils::send_value(socket, value); + if (--_nranks == 0) { + _stop = true; + } } void MasterDaemon::_do_wait(SocketType socket) { + VLOG(3) << "MasterDaemon::_do_wait"; std::string key = tcputils::receive_string(socket); auto iter = _store.find(key); auto reply = ReplyType::STOP_WAIT; @@ -126,35 +136,47 @@ void MasterDaemon::run() { } for (size_t i = 1; i < fds.size(); i++) { - if (fds[i].revents == 0) { - continue; - } - - Command command = tcputils::receive_value(fds[i].fd); - VLOG(3) << "TCPStore: recv command: " << static_cast(command) << "."; - - switch (command) { - case Command::ADD: - _do_add(fds[i].fd); - break; - case Command::GET: - _do_get(fds[i].fd); - break; - case Command::WAIT: - _do_wait(fds[i].fd); - break; - case Command::STOP: - _do_stop(fds[i].fd); - break; + try { + if (fds[i].revents == 0) { + continue; + } + + Command command = tcputils::receive_value(fds[i].fd); + 
VLOG(3) << "TCPStore: recv command: " << static_cast(command) + << "."; + + switch (command) { + case Command::ADD: + _do_add(fds[i].fd); + break; + case Command::GET: + _do_get(fds[i].fd); + break; + case Command::SET: + _do_set(fds[i].fd); + break; + case Command::WAIT: + _do_wait(fds[i].fd); + break; + case Command::STOP: + _do_stop(fds[i].fd); + break; + default: + VLOG(0) << "Unknow command: " << static_cast(command); + exit(-1); + } + } catch (...) { + fds.erase(fds.begin() + i); + _sockets.erase(_sockets.begin() + i - 1); } } } } -std::unique_ptr TCPServer::create(uint16_t port) { +std::unique_ptr TCPServer::create(uint16_t port, int nranks) { int socket = tcputils::tcp_listen("", std::to_string(port), AF_INET); auto server = std::make_unique(); - server->_master_daemon = MasterDaemon::start(socket); + server->_master_daemon = MasterDaemon::start(socket, nranks); return server; } @@ -200,7 +222,7 @@ TCPStore::TCPStore(std::string host, uint16_t port, bool is_master, size_t num_workers, std::chrono::seconds timeout) : Store(timeout), _is_master(is_master), _num_workers(num_workers) { if (_is_master) { - _server = detail::TCPServer::create(port); + _server = detail::TCPServer::create(port, num_workers); } _client = detail::TCPClient::connect(host, port); @@ -213,36 +235,41 @@ void TCPStore::waitWorkers() { } add(_init_key, 1); - if (_server) { - auto begin = std::chrono::steady_clock::now(); - do { - auto value = get(_init_key); - int completed = std::stoi(std::string(value.begin(), value.end())); - VLOG(3) << completed << " worker ready, total " << _num_workers; - if (completed >= _num_workers) { - break; - } - const auto elapsed = std::chrono::duration_cast( - std::chrono::steady_clock::now() - begin); - - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) { - PADDLE_ENFORCE_EQ( - completed, _num_workers, - platform::errors::InvalidArgument( - "TCPStore timeouted and not all workers got 
ready.")); - } - } while (true); - } + auto begin = std::chrono::steady_clock::now(); + do { + auto value = get(_init_key); + int completed = std::stoi(std::string(value.begin(), value.end())); + VLOG(3) << completed << " worker ready, total " << _num_workers; + if (completed >= _num_workers) { + break; + } + const auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - begin); + + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + if (_timeout != tcputils::kNoTimeout && elapsed > _timeout) { + PADDLE_ENFORCE_EQ( + completed, _num_workers, + platform::errors::InvalidArgument( + "TCPStore timeouted and not all workers got ready.")); + } + } while (true); VLOG(3) << "TCPStore initialized."; } int64_t TCPStore::add(const std::string& key, int64_t value) { + VLOG(3) << "TCPStore add."; _client->send_command_for_key(Command::ADD, _key_prefix + key); _client->send_value(value); return _client->receive_value(); } +void TCPStore::set(const std::string& key, const std::vector& value) { + VLOG(3) << "TCPStore set."; + _client->send_command_for_key(Command::SET, _key_prefix + key); + _client->send_vector(value); +} + std::vector TCPStore::get(const std::string& key) { wait(key); _client->send_command_for_key(Command::GET, _key_prefix + key); @@ -252,6 +279,7 @@ std::vector TCPStore::get(const std::string& key) { void TCPStore::wait(const std::string& key) { ReplyType reply; + VLOG(3) << "TCPStore wait."; do { _client->send_command_for_key(Command::WAIT, _key_prefix + key); @@ -261,6 +289,7 @@ void TCPStore::wait(const std::string& key) { } TCPStore::~TCPStore() { + VLOG(3) << "~TCPStore"; _client->send_command_for_key(Command::STOP, ""); ReplyType ret = _client->receive_value(); PADDLE_ENFORCE_EQ(ret, ReplyType::STOP_WAIT, diff --git a/paddle/fluid/distributed/store/tcp_store.h b/paddle/fluid/distributed/store/tcp_store.h index cd706dd6640acf5e0b5b3714175dac7a6cecb25a..17c1d8ea30a421f04d054d59ac93c8c60406ef68 100644 --- 
a/paddle/fluid/distributed/store/tcp_store.h +++ b/paddle/fluid/distributed/store/tcp_store.h @@ -27,15 +27,16 @@ namespace paddle { namespace distributed { enum class ReplyType { WAITING, STOP_WAIT }; -enum class Command { ADD, GET, WAIT, STOP }; +enum class Command { ADD, GET, SET, WAIT, STOP }; namespace detail { class MasterDaemon { public: - static std::unique_ptr start(SocketType listen_socket); + static std::unique_ptr start(SocketType listen_socket, + int nranks); MasterDaemon() = delete; - explicit MasterDaemon(SocketType listen_socket); + explicit MasterDaemon(SocketType listen_socket, int nranks); ~MasterDaemon(); private: @@ -43,18 +44,20 @@ class MasterDaemon { void _do_add(SocketType socket); void _do_wait(SocketType socket); void _do_get(SocketType socket); + void _do_set(SocketType socket); void _do_stop(SocketType socket); SocketType _listen_socket; std::vector _sockets; std::unordered_map> _store; std::thread _background_thread{}; + int _nranks; bool _stop = false; }; class TCPServer { public: TCPServer() = default; - static std::unique_ptr create(std::uint16_t port); + static std::unique_ptr create(std::uint16_t port, int nranks); private: std::unique_ptr _master_daemon; @@ -97,6 +100,7 @@ class TCPStore : public Store { int64_t add(const std::string& key, int64_t value) override; std::vector get(const std::string& key) override; void wait(const std::string& key) override; + void set(const std::string& key, const std::vector& value) override; private: void waitWorkers(); diff --git a/paddle/fluid/distributed/store/tcp_utils.cc b/paddle/fluid/distributed/store/tcp_utils.cc index d0561d0b9a9c5b01c32620e72d21ed562e42637e..a28cba288333d7f1c2a705049c29b59f43a70cc5 100644 --- a/paddle/fluid/distributed/store/tcp_utils.cc +++ b/paddle/fluid/distributed/store/tcp_utils.cc @@ -46,9 +46,10 @@ void close_socket(SocketType socket) { hints.ai_socktype = SOCK_STREAM; const char* node = host.empty() ? 
nullptr : host.c_str(); + const char* port_cstr = port.empty() ? nullptr : port.c_str(); int n; - n = ::getaddrinfo(node, port.c_str(), &hints, &res); + n = ::getaddrinfo(node, port_cstr, &hints, &res); const char* gai_err = ::gai_strerror(n); const char* proto = (family == AF_INET ? "IPv4" : family == AF_INET6 ? "IPv6" : ""); diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 8cb69caf66369655ce751163420b3fcec80dd833..691a381405e9a792d1ee0f256647405a3739e9d8 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,6 +1,7 @@ -set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node custom_operator_node) + set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) -set(generated_deps dygraph_function dygraph_node) +set(generated_deps final_dygraph_function final_dygraph_node dygraph_function dygraph_node) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) message("Performing Eager Dygraph Auto Code Generation") @@ -9,12 +10,14 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) +add_subdirectory(custom_operator) + -cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi phi_api) +cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) -cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi phi_api) -cc_library(utils SRCS utils.cc DEPS phi phi_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) +cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi_api phi_tensor) +cc_library(utils SRCS utils.cc DEPS phi_api 
phi_tensor global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info) add_subdirectory(tests) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 734cabdc3dc914349e2ad30b657bfb6542a7472a..07fa40165167ce2352018c0e1b1cb08222d5a181 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -24,11 +24,14 @@ class GradNodeAccumulation : public GradNodeBase { public: // Constructor: configure fwd input tensors to grad node explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) { + VLOG(6) << "Construct GradNodeAccumulation"; weak_grad_ = meta->WeakGrad(); SetDefaultGradInOutMeta(); } - ~GradNodeAccumulation() override = default; + ~GradNodeAccumulation() override { + VLOG(6) << "Destruct GradNodeAccumulation"; + } // Functor: perform backward computations virtual std::vector> operator()( diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h index c0150a1730d52b3410ba4ea0d31674fbfed596ae..247fde6ed1f869542969b068cdae9f59cedd732a 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h @@ -46,7 +46,7 @@ class GradNodeScale : public GradNodeBase { const std::vector& tensors); void SetAttributes_scale(float scale); - + std::string name() override { return ""; } // Members: define fwd input tensors // For Scale there is no fwd input tensor needed private: diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 00578d9a359a3b8d57148efc959de553e811f541..a9a62fcd50e7a0648e695d1f60d52d3f936c53ed 100644 --- 
a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -18,7 +18,7 @@ #include #include #include "paddle/fluid/imperative/tracer.h" - +#include "paddle/phi/api/ext/op_meta_info.h" namespace egr { class UniqueNameGenerator { @@ -70,6 +70,21 @@ class Controller { void SetInEagerMode(bool in_eager_mode) { in_eager_mode_ = in_eager_mode; } + const std::unordered_map>& + GetOpMetaInfoMap() { + return op_meta_info_map_; + } + + void MergeOpMetaInfoMap(const std::unordered_map< + std::string, std::vector>& map) { + op_meta_info_map_.insert(map.begin(), map.end()); + } + + std::unordered_map>>& + GetCustomEdgesSlotMap() { + return custom_edges_slot_map_; + } + private: Controller() = default; static Controller* controller_; @@ -77,6 +92,11 @@ class Controller { new paddle::imperative::Tracer()}; // TODO(jiabin): remove when we don't need imperative. bool in_eager_mode_{false}; + std::unordered_map> + op_meta_info_map_; + /* op_type : {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}*/ + std::unordered_map>> + custom_edges_slot_map_; DISABLE_COPY_AND_ASSIGN(Controller); }; diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 102fad56373803a19f07afc7dda72e9704ac83d5..6a2e5e7ac6cd75068bba4e9b675ab67588c38366 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -47,6 +47,9 @@ std::unordered_map> static std::unordered_map operators_with_attrs = {}; +/* --- Black Ops list that's NO NEED to apply code generation --- */ +static std::unordered_set black_ops_list = {"run_program"}; + static std::string LegalizeVariableName(const std::string& var_name) { std::string ret = var_name; std::replace(ret.begin(), ret.end(), '-', '_'); // replace all '-' to '_' @@ -73,12 +76,6 @@ static bool IgnoreGradAttribute(const std::string& op_type, } static void 
PrepareAttrMapForOps() { - // Handle "run_program_op" - static framework::ProgramDesc fake_prog; - operators_with_attrs["run_program"] = {}; - operators_with_attrs["run_program"]["global_block"] = - fake_prog.MutableBlock(0); - // Handle "fused_elemwise_add_activation" std::vector functor_list = {"a", "b"}; operators_with_attrs["fused_elemwise_add_activation"] = {}; @@ -996,6 +993,29 @@ static std::string GenerateGradNodeCreationContent( // then generate: "egr::AutogradMeta* p_autograd_out = // egr::EagerUtils::autograd_meta("op_proto->outputs()[0].name()")" std::string get_autograd_meta_str = " // Prepare Autograd Meta \n"; + // If single output slotname and not duplicable, + // then generate: "egr::AutogradMeta* p_autograd_out = + // egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" + for (const proto::OpProto::Var& output : out_vars) { + const std::string& output_name = output.name(); + const std::string& output_autograd_name = "p_autograd_" + output_name; + + if (output.duplicable()) { + const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = + " std::vector %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_autograd_meta_str += paddle::string::Sprintf( + GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + } else { + const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = + " egr::AutogradMeta* %s = " + "egr::EagerUtils::autograd_meta(&%s);\n"; + get_autograd_meta_str += paddle::string::Sprintf( + GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); + } + } + VLOG(6) << "Generated outputs autograd_meta"; + for (const proto::OpProto::Var& input : in_vars) { const std::string& input_name = input.name(); const std::string& input_autograd_name = "p_autograd_" + input_name; @@ -1024,31 +1044,6 @@ static std::string GenerateGradNodeCreationContent( } VLOG(6) << "Generated inputs autograd_meta"; - // If single output slotname and not duplicable, - // then generate: "egr::AutogradMeta* p_autograd_out = - // 
egr::EagerUtils::autograd_meta("op_proto.outputs()[0].name()")" - for (const proto::OpProto::Var& output : out_vars) { - const std::string& output_name = output.name(); - const std::string& output_autograd_name = "p_autograd_" + output_name; - - // Skip Intermediate Tensor - - if (output.duplicable()) { - const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = - " std::vector %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_MULTI_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - } else { - const char* GET_SINGLE_AUTOGRAD_META_TEMPLATE = - " egr::AutogradMeta* %s = " - "egr::EagerUtils::autograd_meta(&%s);\n"; - get_autograd_meta_str += paddle::string::Sprintf( - GET_SINGLE_AUTOGRAD_META_TEMPLATE, output_autograd_name, output_name); - } - } - VLOG(6) << "Generated outputs autograd_meta"; - std::string prepare_autograd_meta_str = ""; prepare_autograd_meta_str += get_autograd_meta_str; prepare_autograd_meta_str += "\n"; @@ -1204,11 +1199,12 @@ static std::string GenerateGradNodeCreationContent( " %s" " bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(%s);\n" " if(require_any_grad) {\n" + " VLOG(6) << \" Construct Grad for %s \"; \n" " egr::EagerUtils::PassStopGradient(%s);\n" "%s\n }"; std::string grad_node_creation_body_str = paddle::string::Sprintf( GRAD_NODE_CREATION_TEMPLATE, prepare_autograd_meta_str, - compute_require_grad_args, pass_stop_gradient_args, + compute_require_grad_args, op_type, pass_stop_gradient_args, grad_node_creation_str); return grad_node_creation_body_str; @@ -1557,9 +1553,23 @@ static std::pair GenerateForwardFunctionContents( core_ops_returns_info[op_type] = return_contents; // [Generation] ComputeRequireGrad -> GradNodeCreation + if (!bwd_info.GenerateForwardOnly()) { std::string grad_node_creation_body_str = GenerateGradNodeCreationContent(fwd_info, bwd_info); + + // Add event record + std::string event_name = op_type + " node_creation"; + const char* 
NODE_CREATION_TEMPLATE = + "{\n" + " paddle::platform::RecordEvent node_creation_record_event(\"%s\", " + "paddle::platform::TracerEventType::Operator, 1);\n" + " %s\n" + "}"; + + grad_node_creation_body_str = paddle::string::Sprintf( + NODE_CREATION_TEMPLATE, event_name, grad_node_creation_body_str); + generated_function_body += grad_node_creation_body_str; generated_function_body += "\n"; @@ -1618,10 +1628,20 @@ static std::pair GenerateForwardFunctionContents( if ((*iter) == ',') dygraph_function_args_str.erase(iter); } - const char* FWD_FUNCTION_TEMPLATE = "%s %s(%s) {\n\n%s\n}\n\n"; + const char* DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE = + "paddle::platform::RecordEvent dygraph_entrance_record_event(\"%s\", " + "paddle::platform::TracerEventType::Operator, 1);"; + std::string event_name = op_type + " dygraph"; + std::string fwd_record_event_str = paddle::string::Sprintf( + DYGRAPH_FUNCTION_EVENT_RECORD_FUNCTION_TEMPLATE, event_name); + const char* FWD_FUNCTION_TEMPLATE = + "%s %s(%s) {\n\n" + " %s\n" + " %s\n" + "}\n\n"; std::string fwd_function_str = paddle::string::Sprintf( FWD_FUNCTION_TEMPLATE, function_proto_return_type_str, function_name, - dygraph_function_args_str, generated_function_body); + dygraph_function_args_str, fwd_record_event_str, generated_function_body); // [Generation] Generate forward functions header const char* FWD_HEADER_TEMPLATE = "%s %s(%s);\n"; @@ -2083,22 +2103,24 @@ static std::string GenerateGradNodeHeaderContents( const char* GRAD_NODE_TEMPLATE = "class GradNode%s : public egr::GradNodeBase {\n" " public:\n" - " GradNode%s() : egr::GradNodeBase() {}\n" + " GradNode%s() : egr::GradNodeBase() { VLOG(7) << \" Construct " + "GradNode%s \"; }\n" " GradNode%s(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : " - "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}\n" - " ~GradNode%s() override = default;\n" + "egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) { VLOG(7) << \" " + "Construct GradNode%s \"; }\n" + " 
~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n" "\n" " virtual std::vector> " "operator()(const " "std::vector>& grads) " "override;\n" "\n" + " std::string name() override { return \" GradNode%s \"; } \n " + "\n" " // SetX, SetY, ...\n" "%s\n" " // SetAttrMap\n" "%s\n" - " std::string name() { return \"GradNode%s\"; }\n" - "\n" " private:\n" " // TensorWrappers\n" "%s\n" @@ -2195,8 +2217,8 @@ static std::string GenerateGradNodeHeaderContents( VLOG(6) << "Generated TensorWrapper"; std::string grad_node_str = paddle::string::Sprintf( - GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, - set_tensor_wrappers_str, set_attr_map_str, op_type, + GRAD_NODE_TEMPLATE, op_type, op_type, op_type, op_type, op_type, op_type, + op_type, op_type, set_tensor_wrappers_str, set_attr_map_str, tensor_wrapper_members_str, attr_members_str); return grad_node_str; @@ -2242,8 +2264,9 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path, "\"paddle/fluid/eager/api/generated/fluid_generated/" "dygraph_forward_api.h\"\n" "#include " - "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n\n" - "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"; + "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n" + "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n" + "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n"; std::string forward_cc_include_str = paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE); std::ofstream forward_cc_stream(forward_cc_path, std::ios::out); @@ -2348,6 +2371,9 @@ static void DygraphCodeGeneration(const std::string& output_dir) { if (!CheckOpProto(op_proto)) continue; const std::string& op_type = op_proto->type(); + if (black_ops_list.count(op_type)) { + continue; + } /* ----------------------------- */ /* ---- Collect Information ---- */ diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt 
b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index c6bca01205e19c58d5924f4e9d60bb76164fee2b..53af6c1048d2454b1e9f375b837103930026ae54 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -1,5 +1,5 @@ -set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml") -set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml") +set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml") +set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml") set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc") set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h") set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc") diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 02183e2ca5ce9f0996017eb7df59ee716b0f1ae2..656418a05ad6d04bc19838c97d86db9cda19c1c6 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -23,15 +23,18 @@ core_ops_returns_info = {} core_ops_args_info = {} core_ops_args_type_info = {} +namespace = "" yaml_types_mapping = { - 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ - 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ - 'Backend' : 'Backend', 'DataLayout' : 'DataLayout', 'DataType' : 'DataType', \ 
- 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', + 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ + 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ + 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ + 'int64[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', 'Tensor[]' : 'std::vector', - 'Tensor[Tensor[]]' : 'std::vector>' + 'Tensor[Tensor[]]' : 'std::vector>', + 'Scalar' : 'paddle::experimental::Scalar', + 'ScalarArray' : 'paddle::experimental::ScalarArray' } @@ -123,6 +126,7 @@ def GetAutoGradMetaVectorName(string): def ReadFwdFile(filepath): f = open(filepath, 'r') contents = yaml.load(f, Loader=yaml.FullLoader) + f.close() return contents @@ -131,15 +135,25 @@ def ReadBwdFile(filepath): contents = yaml.load(f, Loader=yaml.FullLoader) ret = {} for content in contents: - assert 'backward_api' in content.keys() - api_name = content['backward_api'] + if 'backward_api' in content.keys(): + api_name = content['backward_api'] + else: + assert False + ret[api_name] = content + f.close() return ret ###################### ### Yaml Parsers ### ###################### +def RemoveSpecialSymbolsInName(string): + # Remove any name after '@' + ret = string.split("@")[0] + return ret + + def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): # intermediate_outputs : [name0, name1, ...] # forward_returns_list : [[ret_name, type, orig_pos], ...] 
@@ -158,15 +172,19 @@ def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): def ParseDispensable(string): # string: "X, Y" + string = RemoveSpecialSymbolsInName(string) return [v.strip() for v in string.split(",")] def ParseIntermediate(string): + string = RemoveSpecialSymbolsInName(string) return [v.strip() for v in string.split(",")] def ParseNoNeedBuffer(string): # string: "x, y" + string = RemoveSpecialSymbolsInName(string) + no_need_buffer_set = set() for name in string.split(","): no_need_buffer_set.add(name.strip()) @@ -196,6 +214,8 @@ def ParseYamlArgs(string): assert arg_type in yaml_types_mapping.keys() arg_type = yaml_types_mapping[arg_type] + + arg_name = RemoveSpecialSymbolsInName(arg_name) if "Tensor" in arg_type: assert default_value is None inputs_list.append([arg_name, arg_type, i]) @@ -206,40 +226,32 @@ def ParseYamlArgs(string): def ParseYamlReturns(string): - # Example: Tensor, Tensor - - # list = [ ["", ret_type, orig_position], ...] - returns_list = [] - - returns = [x.strip() for x in string.strip().split(",")] - for i in range(len(returns)): - ret = returns[i] - returns_list.append(["", ret, i]) - - return returns_list - - -def ParseYamlReturnsWithName(string): - # Example: Tensor(out), Tensor(out1) + # Example0: Tensor(out), Tensor(out1) + # Example1: Tensor, Tensor + # Example2: Tensor[](out), Tensor # list = [ [ret_name, ret_type, orig_position], ...] 
returns_list = [] returns = [x.strip() for x in string.strip().split(",")] - atype = r'(.*?)' - aname = r'(.*?)' - pattern = f'{atype}\({aname}\)' for i in range(len(returns)): ret = returns[i] - m = re.search(pattern, ret) - ret_type = m.group(1) - ret_name = m.group(2) + + ret_name = "" + if "(" in ret and ")" in ret: + # Remove trailing ')' + ret = ret[:-1] + ret_type = ret.split("(")[0].strip() + ret_name = ret.split("(")[1].strip() + else: + ret_type = ret.strip() assert ret_type in yaml_types_mapping.keys() ret_type = yaml_types_mapping[ret_type] assert "Tensor" in ret_type + ret_name = RemoveSpecialSymbolsInName(ret_name) returns_list.append([ret_name, ret_type, i]) return returns_list @@ -260,7 +272,7 @@ def ParseYamlForwardFromBackward(string): function_returns = m.group(3) forward_inputs_list, forward_attrs_list = ParseYamlArgs(function_args) - forward_returns_list = ParseYamlReturnsWithName(function_returns) + forward_returns_list = ParseYamlReturns(function_returns) return forward_inputs_list, forward_attrs_list, forward_returns_list @@ -290,7 +302,7 @@ def ParseYamlBackward(args_str, returns_str): args_str = re.search(args_pattern, args_str).group(1) inputs_list, attrs_list = ParseYamlArgs(args_str) - returns_list = ParseYamlReturnsWithName(returns_str) + returns_list = ParseYamlReturns(returns_str) return inputs_list, attrs_list, returns_list @@ -516,11 +528,18 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, set_attribute_methods_str += SET_ATTR_METHOD_TEMPLATE.format( aname, GetConstReference(atype), aname, saved_attr_name, aname) - ATTRIBUTE_MEMBER_TEMPLATE = """ - {} {} = {}; -""" - attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( - RemoveConstAndReference(atype), saved_attr_name, default_val) + if default_val: + ATTRIBUTE_MEMBER_TEMPLATE = """ + {} {} = {}; + """ + attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( + RemoveConstAndReference(atype), saved_attr_name, default_val) + else: + 
ATTRIBUTE_MEMBER_TEMPLATE = """ + {} {}; + """ + attribute_members_str += ATTRIBUTE_MEMBER_TEMPLATE.format( + RemoveConstAndReference(atype), saved_attr_name) # End: SetAttributes & Attribute Members grad_node_name = GetGradNodeName(fwd_api_name) @@ -534,7 +553,7 @@ class {} : public egr::GradNodeBase {{ virtual std::vector> operator()( const std::vector>& grads) override; - + std::string name() override {{ return \" {} \"; }} // SetTensorWrapperX, SetTensorWrapperY, ... {} // SetAttributes @@ -549,8 +568,9 @@ class {} : public egr::GradNodeBase {{ """ node_declaration_str = NODE_DECLARATION_TEMPLATE.format( grad_node_name, grad_node_name, grad_node_name, grad_node_name, - set_tensor_wrapper_methods_str, set_attribute_methods_str, - tensor_wrapper_members_str, attribute_members_str) + grad_node_name, set_tensor_wrapper_methods_str, + set_attribute_methods_str, tensor_wrapper_members_str, + attribute_members_str) return node_declaration_str @@ -607,16 +627,23 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, returns_str += f"return returns;\n" grad_node_name = GetGradNodeName(fwd_api_name) + + if len(namespace) > 0: + grad_api_namespace = f"paddle::experimental::{namespace}" + else: + grad_api_namespace = f"paddle::experimental" + FUNCTION_TEMPLATE = """ std::vector> {}::operator()(const std::vector>& grads) {{ // Call grad_api function - auto grad_api_returns = paddle::experimental::{}({}); + auto grad_api_returns = {}::{}({}); {} }} """ node_definition_str = FUNCTION_TEMPLATE.format( - grad_node_name, bwd_api_name, grad_api_args_str, returns_str) + grad_node_name, grad_api_namespace, bwd_api_name, grad_api_args_str, + returns_str) return node_definition_str @@ -670,7 +697,7 @@ def GenerateNodeCreationCodes( else: # Tuple api_result if IsPlainTensorType(rtype): - outputs_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" + output_autograd_meta = f" 
egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);" else: assert IsVectorTensorType(rtype) output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n" @@ -698,18 +725,24 @@ def GenerateNodeCreationCodes( # SetTensorWrappers set_tensor_wrappers_list = [] - for name, (_, is_fwd_input, _) in backward_fwd_input_map.items(): + for name, (atype, is_fwd_input, pos) in backward_fwd_input_map.items(): is_optional = (name in optional_inputs) + if is_fwd_input: if is_optional: set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);" else: set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" else: + if IsVectorTensorType(atype): + tw_name = f"api_result[{pos}]" + else: + tw_name = f"api_result" + if is_optional: - set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, false);" + set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);" else: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);" + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);" set_tensor_wrappers_list.append(set_tensor_wrappers) set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) @@ -849,7 +882,11 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, function_name = fwd_api_name else: function_name = fwd_api_name + "_intermediate" - forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" + + if len(namespace) > 0: + forward_call_str = f"auto api_result = paddle::experimental::{namespace}::{function_name}({inputs_call_args_str});" + else: + forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});" # Get return type list & outputs num_outputs = 
len(forward_outputs_position_map.keys()) - len( @@ -886,8 +923,20 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map, backward_attrs_list, optional_inputs) + node_event_name = fwd_api_name + " node_creation" + NODE_CREATION_TEMPLATE = """{{\n + paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n + {}\n + }}""" + node_creation_str = NODE_CREATION_TEMPLATE.format(node_event_name, + node_creation_str) + + dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);" + FORWARD_FUNCTION_TEMPLATE = """ {} {}({}) {{ + {} + // Forward API Call {} @@ -901,7 +950,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, forward_function_name = GetForwardFunctionName(fwd_api_name) forward_function_str = FORWARD_FUNCTION_TEMPLATE.format( returns_type_str, forward_function_name, inputs_args_definition_str, - forward_call_str, node_creation_str, returns_str) + dygraph_event_str, forward_call_str, node_creation_str, returns_str) forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});" return forward_function_str, forward_function_declaration_str @@ -999,7 +1048,9 @@ def GenerateNodeCCFile(filepath, node_definition_str): #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/phi/api/include/sparse_api.h" """ file_contents += node_definition_str with open(filepath, 'a') as f: @@ -1020,10 +1071,13 @@ def GenerateNodeHFile(filepath, node_declaration_str): def GenerateForwardCCFile(filepath, forward_definition_str): file_contents = """ +#include "paddle/phi/api/lib/dygraph_api.h" #include 
"paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" """ @@ -1041,6 +1095,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): #include "paddle/phi/api/all.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/eager/to_static/run_program_op_func.h" """ file_contents += GenerateCoreOpInfoDeclaration() @@ -1052,134 +1107,184 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_path = args.api_yaml_path - backward_yaml_path = args.backward_yaml_path - - fwd_api_list = ReadFwdFile(api_yaml_path) - grad_api_dict = ReadBwdFile(backward_yaml_path) + api_yaml_paths = args.api_yaml_path.split(",") + backward_yaml_paths = args.backward_yaml_path.split(",") # Generate per Dygraph API node_declaration_str = "" node_definition_str = "" forward_definition_str = "" forward_declaration_str = "" - for fwd_api in fwd_api_list: - # We only generate Ops with grad - if 'backward' not in fwd_api.keys(): - continue - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - assert 'backward' in fwd_api.keys() - - no_need_buffer_set = set() - if 'no_need_buffer' in fwd_api.keys(): - no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer']) - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - bwd_api_name = fwd_api['backward'] - assert bwd_api_name in grad_api_dict.keys() - bwd_api = grad_api_dict[bwd_api_name] - - assert 'args' in bwd_api.keys() - assert 'output' in bwd_api.keys() - assert 'forward' in bwd_api.keys() - - # Parse Dispensable Inputs - 
optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - bwd_forward_str = bwd_api['forward'] - bwd_args_str = bwd_api['args'] - bwd_returns_str = bwd_api['output'] - - # Collect Forward Inputs/Outputs - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( - bwd_forward_str) - print("Parsed Forward Inputs List: ", forward_inputs_list) - print("Prased Forward Attrs List: ", forward_attrs_list) - print("Parsed Forward Returns List: ", forward_returns_list) - - intermediate_outputs = [] - if 'intermediate' in fwd_api.keys(): - intermediate_outputs = ParseIntermediate(fwd_api['intermediate']) - - IntermediateValidationCheck(intermediate_outputs, forward_returns_list) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list) - print("Prased Original Forward Attrs List: ", orig_forward_attrs_list) - print("Parsed Original Forward Returns List: ", - orig_forward_returns_list) - - # Forward Validation Checks - ForwardsValidationCheck(forward_inputs_list, forward_attrs_list, - forward_returns_list, orig_forward_inputs_list, - orig_forward_attrs_list, - orig_forward_returns_list) - - # Parse Backward Inputs/Outputs - backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( - bwd_args_str, bwd_returns_str) - print("Parsed Backward Inputs List: ", backward_inputs_list) - print("Prased Backward Attrs List: ", backward_attrs_list) - print("Parsed Backward Returns List: ", backward_returns_list) - - # Determine Forward Inputs/Outputs Position - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", 
- forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - # SlotName Matching - backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( - backward_inputs_list, backward_returns_list, - forward_inputs_position_map, forward_outputs_position_map) - print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) - print("Generated Backward Grad Input Map: ", backward_grad_input_map) - print("Generated Backward Grad Output Map: ", backward_grad_output_map) - - # Backward Validation Check - BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map, - backward_attrs_list) - - # Node Declaration Generation - node_declaration_str += GenerateNodeDeclaration( - fwd_api_name, backward_fwd_input_map, backward_attrs_list, - no_need_buffer_set) - print("Generated Node Declaration: ", node_declaration_str) - - node_definition_str += GenerateNodeDefinition( - fwd_api_name, bwd_api_name, backward_fwd_input_map, - backward_grad_input_map, backward_grad_output_map, - backward_attrs_list) - print("Generated Node Definition: ", node_definition_str) - - # Node Definition Generation - definition_declaration_pair = GenerateForwardDefinition( - fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, optional_inputs, - intermediate_outputs) - print("Generated Forward Definition: ", forward_definition_str) - print("Generated Forward Declaration: ", forward_declaration_str) - forward_definition_str += definition_declaration_pair[0] - forward_declaration_str += definition_declaration_pair[1] - - # For python-level API dispatch - CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, - forward_attrs_list) + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] + 
backward_yaml_path = backward_yaml_paths[i] + + if "sparse" in api_yaml_path: + assert "sparse" in backward_yaml_path + namespace = "sparse" + else: + namespace = "" + + fwd_api_list = ReadFwdFile(api_yaml_path) + grad_api_dict = ReadBwdFile(backward_yaml_path) + + yaml_forward_definition_str = "" + yaml_forward_declaration_str = "" + yaml_node_declaration_str = "" + yaml_node_definition_str = "" + for fwd_api in fwd_api_list: + # We only generate Ops with grad + if 'backward' not in fwd_api.keys(): + continue + + assert 'api' in fwd_api.keys() + assert 'args' in fwd_api.keys() + assert 'output' in fwd_api.keys() + assert 'backward' in fwd_api.keys() + + no_need_buffer_set = set() + if 'no_need_buffer' in fwd_api.keys(): + no_need_buffer_set = ParseNoNeedBuffer(fwd_api[ + 'no_need_buffer']) + + fwd_api_name = fwd_api['api'] + fwd_args_str = fwd_api['args'] + fwd_returns_str = fwd_api['output'] + + bwd_api_name = fwd_api['backward'] + assert bwd_api_name in grad_api_dict.keys() + bwd_api = grad_api_dict[bwd_api_name] + + assert 'args' in bwd_api.keys() + assert 'output' in bwd_api.keys() + assert 'forward' in bwd_api.keys() + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + + bwd_forward_str = bwd_api['forward'] + bwd_args_str = bwd_api['args'] + bwd_returns_str = bwd_api['output'] + + # Collect Forward Inputs/Outputs + forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward( + bwd_forward_str) + print("Parsed Forward Inputs List: ", forward_inputs_list) + print("Prased Forward Attrs List: ", forward_attrs_list) + print("Parsed Forward Returns List: ", forward_returns_list) + + intermediate_outputs = [] + if 'intermediate' in fwd_api.keys(): + intermediate_outputs = ParseIntermediate(fwd_api[ + 'intermediate']) + + IntermediateValidationCheck(intermediate_outputs, + forward_returns_list) + + # Collect Original Forward Inputs/Outputs 
and then perform validation checks + orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward( + fwd_args_str, fwd_returns_str) + print("Parsed Original Forward Inputs List: ", + orig_forward_inputs_list) + print("Prased Original Forward Attrs List: ", + orig_forward_attrs_list) + print("Parsed Original Forward Returns List: ", + orig_forward_returns_list) + + # Forward Validation Checks + ForwardsValidationCheck( + forward_inputs_list, forward_attrs_list, forward_returns_list, + orig_forward_inputs_list, orig_forward_attrs_list, + orig_forward_returns_list) + + # Parse Backward Inputs/Outputs + backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward( + bwd_args_str, bwd_returns_str) + print("Parsed Backward Inputs List: ", backward_inputs_list) + print("Prased Backward Attrs List: ", backward_attrs_list) + print("Parsed Backward Returns List: ", backward_returns_list) + + # Determine Forward Inputs/Outputs Position + forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + print("Generated Forward Input Position Map: ", + forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + forward_outputs_position_map) + + # SlotName Matching + backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching( + backward_inputs_list, backward_returns_list, + forward_inputs_position_map, forward_outputs_position_map) + print("Generated Backward Fwd Input Map: ", backward_fwd_input_map) + print("Generated Backward Grad Input Map: ", + backward_grad_input_map) + print("Generated Backward Grad Output Map: ", + backward_grad_output_map) + + # Backward Validation Check + BackwardValidationCheck(backward_fwd_input_map, + backward_grad_input_map, + backward_attrs_list) + + # Node Declaration Generation + yaml_node_declaration_str += GenerateNodeDeclaration( + fwd_api_name, 
backward_fwd_input_map, backward_attrs_list, + no_need_buffer_set) + print("Generated Node Declaration: ", node_declaration_str) + + yaml_node_definition_str += GenerateNodeDefinition( + fwd_api_name, bwd_api_name, backward_fwd_input_map, + backward_grad_input_map, backward_grad_output_map, + backward_attrs_list) + print("Generated Node Definition: ", node_definition_str) + + # Node Definition Generation + definition_declaration_pair = GenerateForwardDefinition( + fwd_api_name, bwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, optional_inputs, + intermediate_outputs) + print("Generated Forward Definition: ", forward_definition_str) + print("Generated Forward Declaration: ", forward_declaration_str) + yaml_forward_definition_str += definition_declaration_pair[0] + yaml_forward_declaration_str += definition_declaration_pair[1] + + # For python-level API dispatch + CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, + forward_attrs_list) + + if len(namespace) > 0: + forward_definition_str += f"""namespace {namespace} {{ + {yaml_forward_definition_str} +}} +""" + + forward_declaration_str += f"""namespace {namespace} {{ + {yaml_forward_declaration_str} +}} +""" + + node_declaration_str += f"""namespace {namespace} {{ + {yaml_node_declaration_str} +}} +""" + + node_definition_str += f"""namespace {namespace} {{ + {yaml_node_definition_str} +}} +""" + + else: + forward_definition_str += yaml_forward_definition_str + forward_declaration_str += yaml_forward_declaration_str + node_declaration_str += yaml_node_declaration_str + node_definition_str += yaml_node_definition_str # Generate Files nodes_h_path = args.nodes_h_path diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py 
b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 9329dc5ffc9dd0faa36b8ff6a8373387bc2678c7..9b77f0449e01d6555cd3a25f101e4867ccc6ffd3 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,34 +14,28 @@ import os import argparse -from eager_gen import ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap + +skipped_fwd_api_names = set(["scale"]) atype_to_parsing_function = { "bool": "CastPyArg2Boolean", "int": "CastPyArg2Int", "long": "CastPyArg2Long", + "int64_t": "CastPyArg2Long", "float": "CastPyArg2Float", "string": "CastPyArg2String", - "bool[]": "CastPyArg2Booleans", - "int[]": "CastPyArg2Ints", - "long[]": "CastPyArg2Longs", - "float[]": "CastPyArg2Floats", - "double[]": "CastPyArg2Float64s", - "string[]": "CastPyArg2Strings" -} - -atype_to_cxx_type = { - "bool": "bool", - "int": "int", - "long": "long", - "float": "float", - "string": "std::string", - "bool[]": "std::vector", - "int[]": "std::vector", - "long[]": "std::vector", - "float[]": "std::vector", - "double[]": "std::vector", - "string[]": "std::vector" + "std::vector": "CastPyArg2Booleans", + "std::vector": "CastPyArg2Ints", + "std::vector": "CastPyArg2Longs", + "std::vector": "CastPyArg2Longs", + "std::vector": "CastPyArg2Floats", + "std::vector": "CastPyArg2Float64s", + "std::vector": "CastPyArg2Strings", + "paddle::experimental::Scalar": "CastPyArg2Scalar", + "paddle::experimental::ScalarArray": "CastPyArg2ScalarArray", + "paddle::experimental::Backend": "CastPyArg2Backend", + "paddle::experimental::DataType": "CastPyArg2DataType", } @@ -55,15 +49,9 @@ def ParseArguments(): return args -def 
GetCxxType(atype): - if atype not in atype_to_cxx_type.keys(): - assert False - - return atype_to_cxx_type[atype] - - def FindParsingFunctionFromAttributeType(atype): if atype not in atype_to_parsing_function.keys(): + print(f"Unable to find {atype} in atype_to_parsing_function.") assert False return atype_to_parsing_function[atype] @@ -71,7 +59,7 @@ def FindParsingFunctionFromAttributeType(atype): def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, forward_attrs_list, forward_outputs_position_map, - optional_inputs): + optional_inputs, is_forward_only): # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] @@ -98,18 +86,21 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, # Get Attributes for name, atype, _, pos in forward_attrs_list: parsing_function = FindParsingFunctionFromAttributeType(atype) - cxx_type = GetCxxType(atype) key = f"{name}" parse_attributes_str += f" PyObject* {name}_obj = PyTuple_GET_ITEM(args, {pos});\n" - parse_attributes_str += f" {cxx_type} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" + parse_attributes_str += f" {atype} {name} = {parsing_function}({name}_obj, \"{fwd_api_name}\", {pos});\n" dygraph_function_call_list[pos] = f"{name}" dygraph_function_call_str = ",".join(dygraph_function_call_list) + pythonc_event_str = f"paddle::platform::RecordEvent pythonc_record_event(\"{fwd_api_name} pybind_imperative_func\", paddle::platform::TracerEventType::Operator, 1);" + PYTHON_C_FUNCTION_TEMPLATE = """ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs) {{ + {} + PyThreadState *tstate = nullptr; try {{ @@ -139,11 +130,20 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj }} """ + namespace_str = "" + if len(namespace) > 0: + namespace_str = 
f"{namespace}::" + + if is_forward_only: + fwd_function_name = "paddle::experimental::" + namespace_str + fwd_api_name + else: + fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name) + python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( - fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, - GetForwardFunctionName(fwd_api_name), dygraph_function_call_str) + fwd_api_name, pythonc_event_str, fwd_api_name, get_eager_tensor_str, + parse_attributes_str, fwd_function_name, dygraph_function_call_str) - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" + python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" return python_c_function_str, python_c_function_reg_str @@ -197,7 +197,7 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { """ core_ops_infos_registry = """ - ,{\"get_final_state_core_ops_args_info\", + {\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, {\"get_final_state_core_ops_args_type_info\", @@ -225,9 +225,17 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): #pragma once #include "pybind11/detail/common.h" +#include "paddle/phi/api/all.h" +#include "paddle/phi/api/lib/dygraph_api.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/pybind/op_function_common.h" #include 
"paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include namespace paddle {{ @@ -257,53 +265,80 @@ def GeneratePythonCFile(filepath, python_c_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_path = args.api_yaml_path - fwd_api_list = ReadFwdFile(api_yaml_path) - - python_c_function_list = [] - python_c_function_reg_list = [] - for fwd_api in fwd_api_list: - # We only generate Ops with grad - if 'backward' not in fwd_api.keys(): - continue - - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - assert 'backward' in fwd_api.keys() - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", forward_inputs_list) - print("Prased Original Forward Attrs List: ", forward_attrs_list) - print("Parsed Original Forward Returns List: ", forward_returns_list) - - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( - fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs) - python_c_function_list.append(python_c_function_str) - 
python_c_function_reg_list.append(python_c_function_reg_str) - print("Generated Python-C Function: ", python_c_function_str) - - python_c_functions_str = "\n".join(python_c_function_list) - python_c_functions_reg_str = ",\n".join(python_c_function_reg_list) + api_yaml_paths = args.api_yaml_path.split(",") + + python_c_functions_reg_str = "" + python_c_functions_str = "" + + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] + + if "sparse" in api_yaml_path: + namespace = "sparse" + else: + namespace = "" + + fwd_api_list = ReadFwdFile(api_yaml_path) + + python_c_function_list = [] + python_c_function_reg_list = [] + for fwd_api in fwd_api_list: + + # We only generate Ops with grad + is_forward_only = False + if 'backward' not in fwd_api.keys(): + is_forward_only = True + + assert 'api' in fwd_api.keys() + assert 'args' in fwd_api.keys() + assert 'output' in fwd_api.keys() + + fwd_api_name = fwd_api['api'] + fwd_args_str = fwd_api['args'] + fwd_returns_str = fwd_api['output'] + + if fwd_api_name in skipped_fwd_api_names: + continue + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + + # Collect Original Forward Inputs/Outputs and then perform validation checks + forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( + fwd_args_str, fwd_returns_str) + print("Parsed Original Forward Inputs List: ", forward_inputs_list) + print("Prased Original Forward Attrs List: ", forward_attrs_list) + print("Parsed Original Forward Returns List: ", + forward_returns_list) + + forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( + forward_inputs_list, forward_returns_list) + print("Generated Forward Input Position Map: ", + forward_inputs_position_map) + print("Generated Forward Output Position Map: ", + forward_outputs_position_map) + + python_c_function_str, python_c_function_reg_str = 
GeneratePythonCFunction( + fwd_api_name, forward_inputs_position_map, forward_attrs_list, + forward_outputs_position_map, optional_inputs, is_forward_only) + python_c_function_list.append(python_c_function_str) + python_c_function_reg_list.append(python_c_function_reg_str) + print("Generated Python-C Function: ", python_c_function_str) + + # Append Namespace + python_c_functions_reg_str += ",\n".join( + python_c_function_reg_list) + "," + python_c_functions = "\n".join(python_c_function_list) + if len(namespace) > 0: + python_c_functions_str += f"""namespace {namespace} {{ + {python_c_functions} +}} +""" + + else: + python_c_functions_str += python_c_functions python_c_str = GeneratePythonCWrappers(python_c_functions_str, python_c_functions_reg_str) diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index 9e1dc4f2c8c6ba5c1c7d0c49e5d141d1a6c4c6d3..dca76d3b8a0db8c4284960005bfbad33ce23e20d 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -145,8 +145,7 @@ class AutogradMeta : public AbstractAutogradMeta { private: // TODO(jiabin) :Should we use pointer instead of object? std::shared_ptr grad_{ - std::make_shared( - egr::Controller::Instance().GenerateUniqueName("@grad"))}; + std::make_shared()}; // GradNodeBase is base class of all grad op which is a // wrapper for grad op. 
This class will make grad op easy diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index 356fdcaf054277085be57491eb1525beeac8d792..1987d024d8f3e34121f54962c45f0f8c1e91b723 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -19,6 +19,8 @@ #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -48,12 +50,16 @@ std::unordered_map getInDegreeMap( } visited.insert(node); + PADDLE_ENFORCE_NOT_NULL( + node, + paddle::platform::errors::Fatal( + "We got null node when we traverse the backward graph, and this " + "should not happened please check your code and contact us.")); // Find and append next nodes const std::vector>& edges = node->GetEdges(); for (const auto& edge_list : edges) { for (const Edge& edge : edge_list) { GradNodeBase* next_node = edge.GetMutableGradNode().get(); - // Next node could be nullptr if it is leaf tensor with no // AccumulationNode attached // Or it could also originated from dispensable inputs @@ -67,13 +73,15 @@ std::unordered_map getInDegreeMap( } } } - return node_in_degree_map; } void RunBackward(const std::vector& tensors, const std::vector& grad_tensors, bool retain_graph) { + paddle::platform::RecordEvent backward_record_event( + "backward", paddle::platform::TracerEventType::Operator, 1); + VLOG(6) << "Start Backward"; // *Gradient Hook should happen at node-level // *Inplace version check should perform at node-level @@ -109,7 +117,8 @@ void RunBackward(const std::vector& tensors, // Prepare GradTensorHolder if (!node_input_buffers_dict.count(grad_node)) { - VLOG(6) << "Create Value for grad input tensor " << i; + VLOG(6) << "Create Value for grad input tensor " << i + << " of grad node: " << 
grad_node->name(); node_input_buffers_dict[grad_node] = std::make_unique(grad_node->InputMeta()); } @@ -155,19 +164,27 @@ void RunBackward(const std::vector& tensors, VLOG(6) << "Run Backward"; while (!queue.empty()) { GradNodeBase* node = queue.front(); - queue.pop(); + paddle::platform::RecordEvent node_record_event( + std::string(typeid(*node).name()) + " grad_node", + paddle::platform::TracerEventType::Operator, 1); + + if (queue.size() > 1 && node_in_degree_map[node] != 0) { + queue.pop(); + continue; + } + queue.pop(); // Run node: This is where Hook happens PADDLE_ENFORCE( node_input_buffers_dict.count(node), paddle::platform::errors::Fatal( - "Unable to find next node in the InputBuufer" + "Unable to find next node in the GradTensorHolder \n" "Trying to run Node without configuring its GradTensorHolder")); std::unique_ptr node_input_buffer = std::move(node_input_buffers_dict[node]); - VLOG(6) << "Run Backward Kernel with input_buffer"; + VLOG(6) << "Run Backward Kernel with GradTensorHolder"; // Run Pre Backward Node and get outputs std::vector> grad_output_tensors = (*node)(node_input_buffer->Buffers()); @@ -212,9 +229,8 @@ void RunBackward(const std::vector& tensors, if ((!grad_output_tensor.defined() || !grad_output_tensor.initialized())) { - VLOG(6) - << "We get grad_output_tensor with slot: " << i << ", rank: " << j - << " as uninitialized or undefined in both tensor and variable"; + VLOG(6) << "We get grad_output_tensor with slot: " << i + << ", rank: " << j << " as uninitialized or undefined tensor"; } VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i << ", rank: " << j @@ -225,6 +241,8 @@ void RunBackward(const std::vector& tensors, const auto& input_meta = next_node->InputMeta(); auto grad_tensor_holder = std::make_unique(input_meta); + VLOG(6) << "Construct GradTensorHolder for grad node: " + << next_node->name(); node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); } VLOG(6) << "Sum grad inputs for edge slot: " << 
edge_rank.first @@ -234,10 +252,12 @@ void RunBackward(const std::vector& tensors, // Update queue node_in_degree_map[next_node]--; - PADDLE_ENFORCE(node_in_degree_map[next_node] >= 0, - paddle::platform::errors::Fatal( - "Detected in-degree value smaller than zero." - "Node's in-degree cannot be negative")); + PADDLE_ENFORCE( + node_in_degree_map[next_node] >= 0, + paddle::platform::errors::Fatal( + "Detected in-degree value smaller than zero. For Node: %s" + "Node's in-degree cannot be negative", + next_node->name())); if (node_in_degree_map[next_node] == 0) { queue.emplace(std::move(next_node)); } diff --git a/paddle/fluid/eager/custom_operator/CMakeLists.txt b/paddle/fluid/eager/custom_operator/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccc9a03a55660772b51dc27bbfa78b7531a369d3 --- /dev/null +++ b/paddle/fluid/eager/custom_operator/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(custom_operator_node SRCS custom_operator_node.cc DEPS phi_tensor phi_api grad_node_info custom_operator op_meta_info) diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc new file mode 100644 index 0000000000000000000000000000000000000000..48ac8c8358afd68cee9d22b8ea0a4e8fd7c3c92e --- /dev/null +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/eager/custom_operator/custom_operator_node.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/phi/api/ext/op_meta_info.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace egr { +std::vector> RunCustomOpNode:: +operator()( + const std::vector>& grads) { + paddle::CustomOpKernelContext ctx; + auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs( + egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); + auto grad_outputs_names = paddle::framework::OpMetaInfoHelper::GetOutputs( + egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]); + auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); + auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap(); + + std::vector> tmp_ins( + grad_inputs_name.size()); + VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() + << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); + for (size_t i = 0; i < grads.size(); i++) { + if (map[1].find(i) != map[1].end()) { + VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][i]; + tmp_ins[map[1][i]] = grads[i]; + } + } + + for (auto it : fwd_outs) { + VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second)); + } + + for (auto it : fwd_ins) { + VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second)); + } + + VLOG(6) << "Prepare Grad inputs"; + for (const auto& in : tmp_ins) { + ctx.EmplaceBackInputs(in); + } + VLOG(6) << "Prepare Grad attrs"; + ctx.EmplaceBackAttrs(attrs_); + std::vector> outs( + GetEdges().size()); + std::vector> tmp_outs( + grad_outputs_names.size()); + VLOG(6) << "Prepare Grad outputs for size: " << 
grad_outputs_names.size(); + for (size_t i = 0; i < GetEdges().size(); i++) { + if (map[0].find(i) != map[0].end()) { + VLOG(7) << "Insert grad outputs: " << i + << " with size: " << GetEdges()[i].size() + << " to tmp_outputs: " << map[0][i]; + for (size_t j = 0; j < GetEdges()[i].size(); j++) { + outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ + std::make_shared( + phi::DataType::UNDEFINED), + egr::Controller::Instance().GenerateUniqueName( + "custom_tmp_grad")); + } + tmp_outs[map[0][i]] = outs[i]; + } + } + for (size_t i = 0; i < tmp_outs.size(); i++) { + VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size(); + ctx.EmplaceBackOutputs(tmp_outs[i]); + } + VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_; + + (*paddle::framework::OpMetaInfoHelper::GetKernelFn( + kernel_map.at(op_type_)[1]))(&ctx); + return outs; +} +} // namespace egr diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h new file mode 100644 index 0000000000000000000000000000000000000000..e5ddef9c062149282d790a5fd6bf31b25a20cf5a --- /dev/null +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -0,0 +1,77 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" +#include "paddle/fluid/eager/tensor_wrapper.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/utils/any.h" + +namespace egr { +class RunCustomOpNode : public GradNodeBase { + public: + // Constructor: configure fwd input tensors to grad node + explicit RunCustomOpNode(size_t bwd_in_slot_num, size_t bwd_out_slot_num, + const std::string& op_type) + : GradNodeBase(bwd_in_slot_num, bwd_out_slot_num), op_type_(op_type) { + VLOG(6) << "Construct RunCustomOpNode for op: " << op_type; + } + + ~RunCustomOpNode() override { + VLOG(6) << "Destruct RunCustomOpNode for op: " << op_type_; + } + + // Functor: perform backward computations + virtual std::vector> operator()( + const std::vector>& grads) + override; + + std::string name() { + return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_); + } + + static std::vector ConstructTensorWrapper( + const std::vector& fwd_var) { + std::vector res; + for (auto const& var : fwd_var) { + res.emplace_back(var); + } + return res; + } + + static std::vector Recover( + std::vector* fwd_var) { + std::vector res; + for (size_t i = 0; i < fwd_var->size(); i++) { + res.emplace_back(fwd_var->at(i).recover(nullptr)); + } + return res; + } + + void SetAttrs(const std::vector& attr) { attrs_ = attr; } + + public: + std::unordered_map> fwd_outs; + std::unordered_map> fwd_ins; + std::unordered_map grads2grad_in_map; + + private: + std::vector attrs_; + std::string op_type_{""}; +}; + +} // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index b1189106b8f871ab618972ad93e9812ce443e55d..7eb2902d935c4fd8d5990c81fbf6bcf3fd6e6e66 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -25,11 +25,12 @@ #include "glog/logging.h" /** - * Implementation of GradNodeBase, Edge 
and InputBuffer. + * Implementation of GradNodeBase, Edge and GradTensorHolder. **/ namespace egr { GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { + VLOG(6) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); // adj_edges has the same num as backward outputs @@ -49,11 +50,15 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { // its pre-ops if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node) { + if (node && node.get()) { + VLOG(6) << "Add Edges for slot: " << slot_id + << " which is: " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { meta->SetGradNode(std::make_shared(meta)); + VLOG(6) << "Add Edges for slot: " << slot_id + << " which is: " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } @@ -70,7 +75,7 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { "inputs's slot num.")); if (meta && !meta->StopGradient()) { auto node = meta->GetMutableGradNode(); - if (node) { + if (node && node.get()) { VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " << this->name() << " to " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index eeac1cca4acf33190ce30613e4a86e99a95b651b..16513f05e0777a8e57f54c925d68867dda656612 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -76,10 +76,10 @@ class GradSlotMeta { class GradNodeBase { public: - GradNodeBase() = default; + GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; } GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num); // TODO(jiabin): Should we have other constructor here? 
- virtual ~GradNodeBase() = default; + virtual ~GradNodeBase() { VLOG(6) << "Destruct GradNodeBase"; } /** * operator() designed to contian the real backward execution logic, it should diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h index bb84e2dda81bafe624fe7734a0a47391eeb0adfa..535c93ac53b1751d9634476e47f32dc0cbe22708 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h @@ -30,6 +30,7 @@ class GradTestNode : public egr::GradNodeBase { GradTestNode(float val, int in_num, int out_num) : GradNodeBase(in_num, out_num), val_(val) {} GradTestNode() : GradNodeBase() { val_ = 1.0; } + std::string name() override { return "GradTestNode"; } std::vector> operator()( const std::vector>& grads) override { diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc index 8c6eeca9d3d5d80fd5bfe943ef87ba8640ada4f2..384fdcd6f97c4b318341db68cdd88b644d42d22a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc @@ -24,6 +24,8 @@ #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT); + // TODO(jiabin): remove nolint here!!! 
using namespace egr; // NOLINT diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 6c4bf9a4f17e6f88503f0a1d6ec2f3029000b6f0..056c7102f663b93d215e494908d9c95be832068c 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -33,6 +33,16 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); + using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT @@ -72,6 +82,47 @@ TEST(Benchmark, EagerScaleCPU) { } } +TEST(Benchmark, EagerMatmulCPU) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + for (const std::string& mode : {"Accuracy", "Performance"}) { + paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cpu.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end 
= std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCPU) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 14e7ce8cfcfb4dea0907cd128873223c8e5859a2..5e790389819f53b250db8797c7a8b3466818abfb 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -32,11 +32,21 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); + TEST(Benchmark, EagerScaleCUDA) { eager_test::InitEnv(paddle::platform::CUDAPlace()); @@ -74,6 +84,50 @@ TEST(Benchmark, EagerScaleCUDA) { } } +TEST(Benchmark, EagerMatmulCUDA) { + paddle::platform::CUDAPlace place; + eager_test::InitEnv(place); + + for (const std::string& mode : {"Accuracy", "WarmUp", "Performance"}) { + paddle::framework::DDim ddimX = phi::make_ddim({2, 2}); + paddle::experimental::Tensor X = CreateTensorWithValue( + ddimX, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 1.0, true); + RetainGradForTensor(X); + + paddle::framework::DDim ddimY = phi::make_ddim({2, 2}); + 
paddle::experimental::Tensor Y = CreateTensorWithValue( + ddimY, paddle::platform::CUDAPlace(), phi::DataType::FLOAT32, + phi::DataLayout::NCHW, 2.0, true); + RetainGradForTensor(Y); + + if (mode == "Accuracy") { + benchmark_eager_matmul(X, Y, true /* accuracy_check */); + + } else if (mode == "WarmUp") { + benchmark_eager_matmul(X, Y); + + } else if (mode == "Performance") { + auto t_start = std::chrono::high_resolution_clock::now(); +#ifdef WITH_GPERFTOOLS + ProfilerStart("eager_matmul_cuda.out"); +#endif + benchmark_eager_matmul(X, Y); + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + auto t_end = std::chrono::high_resolution_clock::now(); + double elapsed_time_ms = + std::chrono::duration(t_end - t_start).count(); + std::cout << "Duration: " << elapsed_time_ms << " ms" << std::endl; + + } else { + PADDLE_THROW(paddle::platform::errors::Fatal("Unknown benchmark mode")); + } + } +} + TEST(Benchmark, EagerIntermediateMatmulCUDA) { paddle::platform::CUDAPlace place; eager_test::InitEnv(place); @@ -186,7 +240,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index 3292de9363696dae30d853980eca6fb1ba1055cc..b4b47a85f66662347d5e087cd4391979fb6c4250 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -34,6 +34,16 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, 
ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT); + namespace paddle { namespace imperative { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index e9b7d10070dbf22f10e617d34f143992d19fb659..a3e393b039425e506066b485bc8a8688bff20d96 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -34,8 +34,18 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); + namespace paddle { namespace imperative { @@ -248,7 +258,7 @@ TEST(Benchmark, FluidMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 96126fa5466aace442dfb742f9902539916b853e..769bd7f687f4584d44bbfa30b73611a3128289bf 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -28,6 +28,7 @@ #include "paddle/fluid/eager/utils.h" // Eager Generated +#include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" // Fluid @@ -67,6 +68,29 @@ void 
benchmark_eager_scale(const paddle::experimental::Tensor& tensor, } } +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, + bool accuracy_check) { + paddle::experimental::Tensor input_tensor0 = X; + + size_t max_num_runs = accuracy_check ? 2 : max_num_benchmark_runs; + for (size_t i = 0; i < max_num_runs; i++) { + input_tensor0 = + matmul_final_state_dygraph_function(input_tensor0, Y, false, false); + } + + std::vector target_tensors = {input_tensor0}; + RunBackward(target_tensors, {}); + + if (accuracy_check) { + // Examine Forward Grad (w.r.t max_num_runs = 2) + eager_test::CompareTensorWithValue(input_tensor0, 16); + // Examine Backward Grad (w.r.t max_num_runs = 2) + eager_test::CompareGradTensorWithValue(X, 16); + eager_test::CompareGradTensorWithValue(Y, 16); + } +} + /* ----------------------------------- */ /* ---- Eager Intermediate Matmul ---- */ /* ----------------------------------- */ diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h index 0086b51b57e152c6da935eacba8d93c0d6ab1a71..86bf13707ed40b0c37ccb54695cca3d165768cb6 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.h @@ -51,15 +51,10 @@ void benchmark_eager_scale(const paddle::experimental::Tensor& tensor, bool accuracy_check = false); /* ---- Eager MatMul ---- */ -/* -void benchmark_eager_matmul(const paddle::experimental::Tensor& X, const -paddle::experimental::Tensor& Y, +void benchmark_eager_matmul(const paddle::experimental::Tensor& X, + const paddle::experimental::Tensor& Y, bool accuracy_check = false); -void benchmark_eager_mlp(const paddle::experimental::Tensor& X, - const std::vector& Ws, - const std::vector& Bs, - bool accuracy_check = false); -*/ + void benchmark_eager_intermediate_matmul(const paddle::experimental::Tensor& X, const 
paddle::experimental::Tensor& Y, bool accuracy_check = false); diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index a4bc56bd606f3fbb0f9152d58acb5c8edeecf905..0c894ed267fcdd08d44d4df08bfaf0554874aebf 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -30,6 +30,10 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(Backward, SingleNodeEmptyGrad) { diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 524872b2e55638d25697388aa50724f49f6e3818..36594f1aac8cdb131bb77f1396dca19a0c2e8cc0 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -31,6 +31,10 @@ #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(CrossBatchAccumulation, SingleScaleNode) { diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc index 49bbfc77741a5b82ac9a564e25b484e5dabf77a7..dc44d95daac1d9109bbf2a1d04a8a47b081cead9 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc @@ -27,6 +27,10 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(Forward, SingleNode) { diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc 
b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 5a7bafb2fe37051c0ad054c130d77dd6e05319d2..f7fa642ea8dd17d20816e74c9bfb4cd92b184b4a 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -30,6 +30,13 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +#endif + namespace egr { paddle::experimental::Tensor hook_function( diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 4b7077b13bdd6c48a0a3846656bd3a6337eb9f80..2a5ad53204a6201149bec0b3dac0fa3baf441f2e 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -30,6 +30,12 @@ #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace egr { TEST(Generated, Sigmoid) { diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 9cda961741f55e9b4b7fc8dac61fe4a7c96567cf..d546df4ed087a99a28096a5336fab3826991534a 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -31,6 +31,10 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { paddle::experimental::Tensor hook_function( diff --git 
a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 15b2a62dca751859882e82d46acaa46f27c2c518..56813c498d2410caa452da7a334c393b230c65bf 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -27,6 +27,12 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace egr { paddle::experimental::Tensor hook_function( diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc index ea821d195099f3d632e0d1b2d4937bac812563c8..24e5da060111f083ef9b65574e75295fa07f8f43 100644 --- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc @@ -23,6 +23,10 @@ #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(TensorUtils, Test) { diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h new file mode 100644 index 0000000000000000000000000000000000000000..9967d8c36900f45fdd76272bc4416df1d30f2a6a --- /dev/null +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/eager/to_static/run_program_op_node.h" +#include "paddle/fluid/eager/utils.h" + +inline void run_program_dygraph_function( + const std::vector& x, + const std::vector& params, + std::vector& out, // NOLINT + std::vector& step_scope, // NOLINT + std::vector& dout, // NOLINT + const paddle::framework::AttributeMap& attrs) { + VLOG(2) << "start run run_program"; + // Call forward function + RunProgramAPI(x, params, out, step_scope, dout, attrs); + VLOG(2) << "start run run_program grad"; + + // Prepare Autograd Meta + auto deref_out = details::DereferenceTensors(out); + std::vector p_autograd_x = + egr::EagerUtils::nullable_autograd_meta(x); + std::vector p_autograd_params = + egr::EagerUtils::nullable_autograd_meta(params); + std::vector p_autograd_outs = + egr::EagerUtils::nullable_autograd_meta(deref_out); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, &p_autograd_x, &p_autograd_params); + + if (require_any_grad) { + std::vector out_names; + for (auto& t : deref_out) { + out_names.emplace_back(t.name()); + } + + egr::EagerUtils::PassStopGradient(false, &p_autograd_outs); + // Create GradOpNode (1 means [out_grad], 2 means [x_grad, paramx_grad]) + auto grad_node = std::make_shared(1, 2); + + grad_node->SetFwdOutNames(out_names); + grad_node->SetOut(out); + // Set Attributes + grad_node->SetAttrMap(attrs); + 
// Set TensorWrappers + grad_node->SetFwdX(x); + grad_node->SetFwdParams(params); + grad_node->SetStepScope(step_scope); + + // Set Grad out rank as same as fwd input and set stop gradient to bwd + grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0); + grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1); + + grad_node->SetGradInMeta(&p_autograd_outs, 0); + // Set Next Edges + grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0); + grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1); + + egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0); + + // Set History for output set current Grad Node for + egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node); + egr::EagerUtils::CheckAndRetainGrad(deref_out); + } +} diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h new file mode 100644 index 0000000000000000000000000000000000000000..d99624e49324853d513a20a725c1a3d12b6aaab5 --- /dev/null +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -0,0 +1,481 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tensor_wrapper.h" + +#include "paddle/fluid/operators/run_program_op.h" +#include "paddle/fluid/platform/enforce.h" + +namespace details { +using Tensor = paddle::experimental::Tensor; + +static std::vector DereferenceTensors( + const std::vector &tensor_ptr) { + std::vector res; + for (auto *t : tensor_ptr) { + res.emplace_back(*t); + } + return res; +} + +static std::vector GetTensorsName(const std::vector &ins) { + std::vector in_names; + for (auto &in_t : ins) { + in_names.emplace_back(in_t.name()); + } + return in_names; +} + +static std::vector GetTensorsName( + const std::vector &ins) { + std::vector in_names; + for (auto *in_t : ins) { + in_names.emplace_back(in_t->name()); + } + return in_names; +} + +static void CheckInputVarStatus(const Tensor &tensor) { + PADDLE_ENFORCE_EQ( + tensor.defined() && phi::DenseTensor::classof(tensor.impl().get()), true, + paddle::platform::errors::InvalidArgument( + "The input tensor %s of " + "RunProgram(Grad)Op holds " + "wrong type. Expect type is DenseTensor.", + tensor.name())); + + PADDLE_ENFORCE_EQ(tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in input tensor %s of " + "RunProgram(Grad)Op " + "is not initialized.", + tensor.name())); +} + +static void CheckOutputVarStatus(const paddle::framework::Variable &src_var, + const Tensor &dst_tensor) { + auto name = dst_tensor.name(); + PADDLE_ENFORCE_EQ(dst_tensor.defined(), true, + paddle::platform::errors::InvalidArgument( + "dst_tensor shall be defined.")); + + if (phi::DenseTensor::classof(dst_tensor.impl().get())) { + auto &src_tensor = src_var.Get(); + PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true, + paddle::platform::errors::InvalidArgument( + "The output tensor %s get from " + "RunProgram(Grad)Op's internal scope holds " + "wrong type. 
Expect type is DenseTensor", + name)); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in output tensor %s get from " + "RunProgram(Grad)Op's internal " + "scope is not initialized.", + name)); + } else if (phi::SelectedRows::classof(dst_tensor.impl().get())) { + auto &src_tensor = src_var.Get(); + PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true, + paddle::platform::errors::InvalidArgument( + "The output tensodfr %s get from " + "RunProgram(Grad)Op's internal scope holds " + "wrong type. Expect type is SelectedRows", + name)); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in output tensor %s get from " + "RunProgram(Grad)Op's " + "internal scope is not initialized.", + name)); + + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The RunProgram(Grad)Op only support output " + "variable of type LoDTensor or SelectedRows", + name)); + } +} + +static void ShareTensorsIntoScope(const std::vector &tensors, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + auto name = tensors[i].name(); + if (name == "Fake_var" || !tensors[i].is_initialized()) { + continue; + } + auto *var = scope->Var(name); + CheckInputVarStatus(tensors[i]); + // share tensor + auto tensor_base = tensors[i].impl(); + if (phi::DenseTensor::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } + } +} + +static void ShareTensorsFromScope( + const std::vector &tensors, + const paddle::framework::BlockDesc &global_block, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + // NOTE: In case of setting 
out_tmp.stop_gradient = True in model code, all + // parameters before generating out_tmp have no @GRAD, it will raise error + // because we can't find them in scope. So we skip sharing these vars or + // var@GRAD if they don't appear in global block. + auto &name = tensors[i]->name(); + if (name == paddle::framework::kEmptyVarName || name == "Fake_var" || + !global_block.HasVar(name)) { + VLOG(2) << "find tensor name is " << name << ", skip it!"; + continue; + } + // NOTE: Here skip not found var is dangerous, if a bug is caused here, + // the result is grad calculation error, which will be very hidden! + auto *var = scope->FindVar(name); + PADDLE_ENFORCE_NOT_NULL(var, paddle::platform::errors::NotFound( + "The output tensor %s is not in " + "RunProgram(Grad)Op'" + "s internal scope.", + name)); + CheckOutputVarStatus(*var, *tensors[i]); + // share tensor + // TODO(dev): Determine Tensor type by scope.var + // auto tensor_base = tensors[i]->impl(); + // if (phi::DenseTensor::classof(tensor_base.get())) { + if (var->IsType()) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + VLOG(2) << "share " << name << " from scope"; + *dst_tensor = src_tensor; + } else if (var->IsType()) { + // } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + *dst_tensor = src_tensor; + } + } +} + +} // namespace details + +inline void RunProgramAPI( + const std::vector &x, + const std::vector ¶ms, + std::vector &out, // NOLINT + std::vector &step_scope, // NOLINT + std::vector &dout, // NOLINT + const paddle::framework::AttributeMap &attrs) { + VLOG(2) << "RunProgramOpKernel Compute"; + auto start_op_index = BOOST_GET_CONST(int64_t, attrs.at("start_op_index")); + auto end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + auto is_test = BOOST_GET_CONST(bool, attrs.at("is_test")); + auto program_id 
= BOOST_GET_CONST(int64_t, attrs.at("program_id")); + + // NOTE(chenweihang): In order not to add new variable type, use vector + // here. Originally, here can use scope directly. + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + // Step 2. prepare executor and init persistable variables + + // NOTE(Aurelius84): While training some models, forward can be called many + // times and then apply backpropagation all at once, such as Reinforcement + // Learning. Tensor data in multi-step training should be saved into single + // scope separately. Otherwise, the gradients can be miscalculated because + // always using the Tensor data of the last step in forward. + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + VLOG(2) << "The number of sub scopes before forward: " + << out_scope_vec->front()->kids().size(); + paddle::framework::Scope &scope = global_inner_scope->NewScope(); + + // share input_vars & parameters into scope + details::ShareTensorsIntoScope(x, &scope); + details::ShareTensorsIntoScope(params, &scope); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto input_names = details::GetTensorsName(x); + auto output_names = details::GetTensorsName(out); + auto dout_names = details::GetTensorsName(dout); + auto *program = global_block->Program(); + + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad=*/false, program_id, &scope); + auto ¶llel_executor = cache_info.first; + // all out_vars are skip_eager_var + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, false); + if (cache_info.second 
/*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_names); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + output_names.begin(), output_names.end()); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + dout_names.begin(), dout_names.end()); + paddle::framework::details::ParseSafeEagerDeletionSkipVars( + *program, end_op_index, output_names, &skip_eager_delete_vars); + } + + // Step 3. run ops + parallel_executor->RunWithoutFetch(skip_eager_delete_vars); + } + // Step 4. Get Output + details::ShareTensorsFromScope(out, *global_block, &scope); + details::ShareTensorsFromScope(dout, *global_block, &scope); + + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + // Step 5. Drop all children scopes while testing. + if (is_test) { + out_scope_vec->front()->DropKids(); + } + VLOG(2) << "The number of sub scopes after forward: " + << out_scope_vec->front()->kids().size(); +#ifdef PADDLE_WITH_MKLDNN + if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); +#endif +} + +inline void RunProgramGradAPI( + const std::vector &x, + const std::vector ¶ms, + const std::vector &out_grad, + const std::vector &step_scope, // NOLINT + const paddle::framework::AttributeMap &attrs, + std::vector &x_grad, // NOLINT + std::vector ¶ms_grad // NOLINT + ) { + // if all output vars are set to stop_gradient, grad op no need to executed + if (x_grad.empty() && params_grad.empty()) return; + + // TODO(dev): Remove this line hard code. And need to deal with the out_grad + // name problem. 
+ // const_cast(out_grad[0]) + // .set_name("matmul_v2_0.tmp_0@GRAD"); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + + auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id")); + // NOTE: skip `shape` and `fill_constant` op created by + // fluid.backward.gradients, one forward output will generate one `shape` + // and `fill_constant` + int64_t start_op_index = orig_end_op_index + (out_grad.size() * 2); + int64_t end_op_index = global_block->OpSize(); + + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + auto sub_scope_num = global_inner_scope->kids().size(); + VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; + PADDLE_ENFORCE_GT(sub_scope_num, 0, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should hold at " + "least one sub scope.")); + + auto &scope = *(global_inner_scope->kids().front()); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto out_grad_names = details::GetTensorsName(out_grad); + // NOTE: after PR22939 [Add double grad] merged, the grad op maker's + // SetOutput will set to None if the input var stop_gradient=True, + // it will cause an NotFound error when ctx.OutputNames() is called + std::vector x_grad_names; + std::vector param_grad_names; + if (!x_grad.empty()) { + x_grad_names = details::GetTensorsName(x_grad); + } + if (!params_grad.empty()) { + param_grad_names = details::GetTensorsName(params_grad); + } + + // Step 2. 
prepare executor and scope + auto *program = global_block->Program(); + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad*/ true, program_id, &scope); + auto ¶llel_executor = cache_info.first; + + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, true); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, out_grad_names); + + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + x_grad_names.begin(), x_grad_names.end()); + paddle::framework::details::AppendSkipDeletionVars( + param_grad_names, &skip_eager_delete_vars); + } + + details::ShareTensorsIntoScope(out_grad, &scope); + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + + // Step 3. run ops + parallel_executor->RunWithoutFetch( + /*skip_eager_delete_vars=*/skip_eager_delete_vars); + } + + // Step 4. get outputs + details::ShareTensorsFromScope(x_grad, *global_block, &scope); + details::ShareTensorsFromScope(params_grad, *global_block, &scope); + + // Step5. 
drop current scope + global_inner_scope->DeleteScope(&scope); + VLOG(2) << "The number of sub scopes after backward: " + << global_inner_scope->kids().size(); +} + +class GradNodeRunProgram : public egr::GradNodeBase { + public: + GradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + + ~GradNodeRunProgram() override = default; + // Functor: perform backward computations + virtual std::vector> operator()( + const std::vector> &grads) + override { + VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; + PADDLE_ENFORCE_EQ( + grads.size(), 1, + paddle::platform::errors::InvalidArgument( + "The out_grads.size() of RunProgramGradOp should be equal to 1.")); + + VLOG(3) << "out_grads[0].size() : " << grads[0].size(); + std::vector x_grad; + std::vector params_grad; + ConstructGradTensors(x_, &x_grad); + ConstructGradTensors(params_, ¶ms_grad); + std::vector x_grad_ptr; + std::vector params_grad_ptr; + for (auto &i : x_grad) { + x_grad_ptr.emplace_back(&i); + } + for (auto &i : params_grad) { + params_grad_ptr.emplace_back(&i); + } + + // auto x_grad_ptr = ConstructGradTensors(x_); + // auto params_grad_ptr = ConstructGradTensors(params_); + + PADDLE_ENFORCE_EQ( + grads[0].size(), fwd_out_names_.size(), + paddle::platform::errors::InvalidArgument( + "The grads[0].size() and fwd_out_names_.size() should be equal.")); + for (size_t i = 0; i < fwd_out_names_.size(); ++i) { + auto &out_grad = egr::EagerUtils::unsafe_autograd_meta(*out_[i])->Grad(); + const_cast(out_grad).set_impl( + grads[0][i].impl()); + + const_cast(grads[0][i]) + .set_name(fwd_out_names_[i] + "@GRAD"); + } + + RunProgramGradAPI(x_, params_, grads[0], step_scope_, attrs_, x_grad_ptr, + params_grad_ptr); + VLOG(3) << "End Eager Backward Node: GradNodeRunProgram"; + return {x_grad, params_grad}; + // return {x_grad, details::DereferenceTensors(params_grad_ptr)}; + } + + // SetAttrMap + void SetAttrMap(const 
paddle::framework::AttributeMap &attrs) { + attrs_ = attrs; + } + + void SetFwdX(const std::vector &tensors) { + x_ = tensors; + } + + void SetFwdParams(const std::vector &tensors) { + params_ = tensors; + } + + void SetStepScope(const std::vector &scopes) { + step_scope_ = scopes; + } + + void SetFwdOutNames(std::vector out_names) { + fwd_out_names_ = out_names; + } + + void SetOut(const std::vector &out) { + out_ = out; + } + + protected: + void ConstructGradTensors( + const std::vector &fwd_tensors, + std::vector *grad_tensors) { + // TODO(dev): Need an elegant way to determine inforamtion of grad_tensor, + // such as: name, tensor type(DenseTensor or SelectedRows). + VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); + for (auto &fwd_t : fwd_tensors) { + if (phi::DenseTensor::classof(fwd_t.impl().get())) { + grad_tensors->emplace_back(std::make_shared()); + } else if (phi::SelectedRows::classof(fwd_t.impl().get())) { + grad_tensors->emplace_back(std::make_shared()); + } + auto &grad_t = grad_tensors->back(); + grad_t.set_name(fwd_t.name() + "@GRAD"); + } + } + + void ConstructGradTensors( + const std::vector &fwd_tensors) { + VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size(); + for (auto &fwd_t : fwd_tensors) { + auto grad_tesnor = egr::EagerUtils::unsafe_autograd_meta(fwd_t)->Grad(); + grad_tesnor.set_name(fwd_t.name() + "@GRAD"); + } + } + + private: + // TensorWrappers + std::vector x_; + std::vector params_; + std::vector step_scope_; + + std::vector fwd_out_names_; + std::vector out_; + + // Attribute Map + paddle::framework::AttributeMap attrs_; +}; diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 39861c80522a920502fff91177256a4b7abf6dc6..8a57d2694535e9c27e88416468fe5a67ce020b43 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -122,9 +122,10 @@ paddle::experimental::Tensor* EagerUtils::mutable_grad( void EagerUtils::SetHistory(std::vector* autograd_metas, const std::shared_ptr& 
grad_node) { for (const auto& autograd_meta : *autograd_metas) { - if (dynamic_cast(autograd_meta->GradNode())) { - VLOG(6) << "Warning: Reseting GradNodeAccumulation for leaf tensor is " - "detected"; + if (autograd_meta->GradNode()) { + VLOG(7) << "Should not set grad node twice, original node is:" + << autograd_meta->GradNode()->name() + << "current is: " << grad_node->name(); } autograd_meta->SetGradNode(grad_node); } @@ -132,11 +133,11 @@ void EagerUtils::SetHistory(std::vector* autograd_metas, void EagerUtils::SetHistory(AutogradMeta* autograd_meta, const std::shared_ptr& grad_node) { - if (dynamic_cast(autograd_meta->GradNode())) { - VLOG(6) - << "Warning: Reseting GradNodeAccumulation for leaf tensor is detected"; + if (autograd_meta->GradNode()) { + VLOG(7) << "Should not set grad node twice, original node is:" + << autograd_meta->GradNode()->name() + << "current is: " << grad_node->name(); } - autograd_meta->SetGradNode(grad_node); } diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 02d90b9c6da1e9f5ca72124b8661658fe005e214..5dc3d9e89c557e86f5af821446b82ad691ad5c95 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -440,11 +440,11 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api) -cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry phi_custom_kernel phi_tensor_raw) + #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) -set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator custom_kernel) 
+set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) diff --git a/paddle/fluid/framework/custom_kernel.cc b/paddle/fluid/framework/custom_kernel.cc deleted file mode 100644 index 49a1e0774a6b1a7a1afd154029850ceb52040759..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/custom_kernel.cc +++ /dev/null @@ -1,47 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#if defined _WIN32 || defined __APPLE__ -#else -#define _LINUX -#endif - -#include "paddle/fluid/framework/custom_kernel.h" -#include "paddle/phi/core/custom_kernel.h" - -namespace paddle { -namespace framework { - -void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle) { -#ifdef _LINUX - typedef phi::CustomKernelMap& get_custom_kernel_map_t(); - auto* func = reinterpret_cast( - dlsym(dso_handle, "PD_GetCustomKernelMap")); - - if (func == nullptr) { - LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]: fail to find " - << "PD_GetCustomKernelMap symbol in this lib."; - return; - } - auto& custom_kernel_map = func(); - phi::RegisterCustomKernels(custom_kernel_map); - LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path; -#else - VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux."; -#endif - return; -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index b9e3bee25f6b5377dde7b525138643964fd8366a..478e39b99dcc9935306603a48810d46ba792d3c3 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -25,6 +25,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/op_meta_info_helper.h" @@ -946,15 +947,16 @@ void RegisterOperatorWithMetaInfoMap( ////////////////////// User APIs /////////////////////// // load op api -void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { +const std::unordered_map>& +LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) { void* handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); VLOG(3) << "load custom_op lib: " << dso_name; typedef OpMetaInfoMap& get_op_meta_info_map_t(); auto* get_op_meta_info_map = detail::DynLoad(handle, "PD_GetOpMetaInfoMap"); auto& op_meta_info_map = get_op_meta_info_map(); - RegisterOperatorWithMetaInfoMap(op_meta_info_map, handle); + return op_meta_info_map.GetMap(); } } // namespace framework diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h index 4310b564371822d0238a55b9091f524d8d419966..fef1e82a14fe6e03de40c8376f922f87f64564f8 100644 --- a/paddle/fluid/framework/custom_operator.h +++ b/paddle/fluid/framework/custom_operator.h @@ -20,9 +20,9 @@ limitations under the License. */ namespace paddle { namespace framework { - // Load custom op api: register op after user compiled -void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); +const std::unordered_map>& +LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); // Register custom op api: register op directly void RegisterOperatorWithMetaInfoMap( @@ -31,6 +31,5 @@ void RegisterOperatorWithMetaInfoMap( // Interface for selective register custom op. 
void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, void* dso_handle = nullptr); - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 66dfb81755f1c9cc16ab8a52df429af8d94ab718..948eaab40b4f64f2a87a83fab80d4eade5288e91 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass) + fix_op_run_order_pass fuse_gemm_epilogue_pass) if (WITH_CINN) set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index c99200ec98aa8f0736610f659d3b94e3c2f1e023..fdf74d2f769fcdd49da19c0118a23d6b8fbb06e4 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -1,4 +1,5 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -175,6 +176,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif + +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) + AppendPassWithCheck(strategy_.fuse_gemm_epilogue_, + "fuse_gemm_epilogue_pass"); +#endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); // for single card training, fuse_all_reduce_ops is unnecessary. 
@@ -507,3 +513,6 @@ USE_PASS(mkldnn_placement_pass); !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif +#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060) +USE_PASS(fuse_gemm_epilogue_pass); +#endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 70a083dd70bc3b48bf24b050673f3da7b69b1755..5eb584aaefa981ab6c6f25df7a765ae9a3d0194a 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -1,4 +1,5 @@ // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -124,6 +125,8 @@ struct BuildStrategy { paddle::optional fuse_broadcast_ops_{paddle::none}; // replace batch_norm with sync_batch_norm. bool sync_batch_norm_{false}; + // Fuse GEMM+Epilogue via cublasLt epilogue. + bool fuse_gemm_epilogue_{false}; // mkldnn_enabled_op_types specify the operator type list to // use MKLDNN acceleration. 
It is null in default, means diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index b7cb2ce0f0102bd34940864960118f396c5dcad7..59220fc9cdaf1f05f70e8cfe961071c1fad3a760 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -186,45 +186,63 @@ void HashTable::insert(const KeyType* d_keys, size_t len, template void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { container_->prefetch(cudaCpuDeviceId, stream); + std::vector threads; size_t num = container_->size(); KeyType unuse_key = std::numeric_limits::max(); thrust::pair* kv = container_->data(); - for (size_t i = 0; i < num; ++i) { - if (kv[i].first == unuse_key) { - continue; - } - ValType& gpu_val = kv[i].second; + + int thread_num = 8; + int len_per_thread = num / thread_num; + int remain = num % thread_num; + int begin = 0; + + auto dump_func = [unuse_key, kv](int left, int right) { + for (int i = left; i < right; i++) { + if (kv[i].first == unuse_key) { + continue; + } + ValType& gpu_val = kv[i].second; #ifdef PADDLE_WITH_PSLIB - auto* downpour_value = - (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); - int downpour_value_size = downpour_value->size(); - if (gpu_val.mf_size > 0 && downpour_value_size == 7) { - downpour_value->resize(gpu_val.mf_size + downpour_value_size); - } - float* cpu_val = downpour_value->data(); - // cpu_val[0] = 0; - cpu_val[1] = gpu_val.delta_score; - cpu_val[2] = gpu_val.show; - cpu_val[3] = gpu_val.clk; - cpu_val[4] = gpu_val.lr; - cpu_val[5] = gpu_val.lr_g2sum; - cpu_val[6] = gpu_val.slot; - if (gpu_val.mf_size > 0) { - for (int x = 0; x < gpu_val.mf_size; x++) { - cpu_val[x + 7] = gpu_val.mf[x]; + auto* downpour_value = + (paddle::ps::DownpourFixedFeatureValue*)(gpu_val.cpu_ptr); + int downpour_value_size = downpour_value->size(); + if (gpu_val.mf_size > 0 && downpour_value_size == 7) { + 
downpour_value->resize(gpu_val.mf_size + downpour_value_size); + } + float* cpu_val = downpour_value->data(); + // cpu_val[0] = 0; + cpu_val[1] = gpu_val.delta_score; + cpu_val[2] = gpu_val.show; + cpu_val[3] = gpu_val.clk; + cpu_val[4] = gpu_val.lr; + cpu_val[5] = gpu_val.lr_g2sum; + cpu_val[6] = gpu_val.slot; + if (gpu_val.mf_size > 0) { + for (int x = 0; x < gpu_val.mf_size; x++) { + cpu_val[x + 7] = gpu_val.mf[x]; + } } - } #endif #ifdef PADDLE_WITH_PSCORE - auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); - downpour_value->count_ = gpu_val.show; - for (int x = 0; x < gpu_val.mf_size; x++) { - downpour_value->data_[x] = gpu_val.mf[x]; - } + auto* downpour_value = (paddle::distributed::VALUE*)(gpu_val.cpu_ptr); + downpour_value->count_ = gpu_val.show; + for (int x = 0; x < gpu_val.mf_size; x++) { + downpour_value->data_[x] = gpu_val.mf[x]; + } #endif + } + }; + + for (int i = 0; i < thread_num; i++) { + threads.push_back(std::thread( + dump_func, begin, begin + len_per_thread + (i < remain ? 1 : 0))); + begin += len_per_thread + (i < remain ? 
1 : 0); + } + for (std::thread& t : threads) { + t.join(); } - container_->prefetch(devid, stream); + // container_->prefetch(devid, stream); } template diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 9f2bdeffecf62764f5cbe5bea9cb50d4830be43b..c1f8041cc1eca34b858608ffb77598ce095d0b4f 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -231,19 +231,19 @@ void CustomDeviceUnsafeFastGarbageCollector::ClearCallback( CustomStreamGarbageCollector::CustomStreamGarbageCollector( const platform::CustomPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) { - platform::DeviceGuard guard(place); - stream_.reset(new platform::stream::Stream); + phi::DeviceGuard guard(place); + stream_.reset(new phi::stream::Stream); stream_->Init(place); - callback_manager_.reset(new platform::CallbackManager(stream_.get())); + callback_manager_.reset(new phi::CallbackManager(stream_.get())); } CustomStreamGarbageCollector::~CustomStreamGarbageCollector() { - platform::DeviceGuard guard(this->dev_ctx_->GetPlace()); + phi::DeviceGuard guard(this->dev_ctx_->GetPlace()); stream_->Synchronize(); stream_->Destroy(); } -platform::stream::Stream *CustomStreamGarbageCollector::stream() const { +phi::stream::Stream *CustomStreamGarbageCollector::stream() const { return stream_.get(); } diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index a67860c6087e0f173e09d2a7c131703260c562fd..f0027c676050b8c31c0bc0ca4ab3b6444f29e1a2 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -230,14 +230,14 @@ class CustomStreamGarbageCollector : public GarbageCollector { void Wait() const override; - platform::stream::Stream *stream() const; + phi::stream::Stream *stream() const; protected: void ClearCallback(const std::function &callback) override; private: - 
std::unique_ptr stream_; - std::unique_ptr callback_manager_; + std::unique_ptr stream_; + std::unique_ptr callback_manager_; }; #endif diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 57fb68e80427afa56372bebb31ff5822135858b6..b1d7059f311cd370a40e83d7b0016d5af8cdb163 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -90,6 +90,8 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { bool IsForInferShape() const override { return true; } + bool IsRuntime() const override { return ctx_.IsRuntime(); } + private: const InferShapeContext& ctx_; }; @@ -232,16 +234,8 @@ class CompatMetaTensor : public phi::MetaTensor { } } - void share_meta(const MetaTensor& meta_tensor) override { + void share_dims(const MetaTensor& meta_tensor) override { set_dims(meta_tensor.dims()); - set_dtype(meta_tensor.dtype()); - // VarDesc doesn't contains layout, so we cannot share layout - // set_layout(meta_tensor.layout()); - - // special case 1: share lod of LoDTensor - share_lod(meta_tensor); - - // special case 2: share height and rows of SelectedRows in runtime if (is_runtime_) { auto* var = BOOST_GET(Variable*, var_); if (var->IsType()) { @@ -254,6 +248,16 @@ class CompatMetaTensor : public phi::MetaTensor { } } + void share_meta(const MetaTensor& meta_tensor) override { + share_dims(meta_tensor); + set_dtype(meta_tensor.dtype()); + // VarDesc doesn't contains layout, so we cannot share layout + // set_layout(meta_tensor.layout()); + + // special case: share lod of LoDTensor + share_lod(meta_tensor); + } + private: const LoD& GetRuntimeLoD() const { auto* var = BOOST_GET_CONST(Variable*, var_); @@ -293,7 +297,8 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, VLOG(3) << "BuildInferMetaContext: op kernel signature - " << signature; // 2. 
build infermeta context - phi::InferMetaContext infer_meta_context(ctx->IsRuntime()); + phi::InferMetaContext infer_meta_context( + {ctx->IsRuntime(), ctx->IsRunMKLDNNKernel()}); auto& input_names = std::get<0>(signature.args); auto& attr_names = std::get<1>(signature.args); @@ -381,6 +386,10 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr(std::move( phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr(std::move( + phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); } else if (std::type_index(attr.type()) == std::type_index(typeid(int))) { infer_meta_context.EmplaceBackAttr( @@ -491,8 +500,22 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, "Unsupported attribute type is received when call " "InferShapeFunctor.")); } - } else { - // do nothing + } else if (ctx->HasInput(attr_name)) { + // convert from data + if (attr_defs[i].type_index == std::type_index(typeid(int32_t))) { + if (ctx->IsRuntime()) { + const auto& infershape_inputs = ctx->GetInputVarPtrs(attr_name); + auto var_temp = BOOST_GET_CONST(Variable*, infershape_inputs[i]); + auto val = experimental::MakePhiScalarFromVar(*var_temp); + int32_t val_int = val.template to(); + infer_meta_context.EmplaceBackAttr(val_int); + } else { + infer_meta_context.EmplaceBackAttr(-1); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Get value from variable only support int yet")); + } } } diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index 64c8371d583ffef621e5009504d14308dd7b997c..b692b6ffab08014f7de6ef4e5488445204396853 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -29,7 +29,7 @@ namespace framework { phi::InferMetaContext 
BuildInferMetaContext(InferShapeContext* ctx, const std::string& op_type); -#define DELCARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ +#define DECLARE_INFER_SHAPE_FUNCTOR(op_type, functor_name, fn) \ struct functor_name : public paddle::framework::InferShapeBase { \ void operator()( \ paddle::framework::InferShapeContext* ctx) const override { \ diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc index 53dcc19fcbae88ab5ccfcc498037327946029927..2eeefb19a1aa8c5c9e4f92ff06618c719bb30785 100644 --- a/paddle/fluid/framework/infershape_utils_test.cc +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -110,9 +110,9 @@ void InferShapeUtilsTestKernel( } // namespace framework } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, +DECLARE_INFER_SHAPE_FUNCTOR(infer_shape_utils_test, InferShapeUtilsTestInferShapeFunctor, - PT_INFER_META(paddle::framework::TestInferMeta)); + PD_INFER_META(paddle::framework::TestInferMeta)); REGISTER_OPERATOR(infer_shape_utils_test, paddle::framework::InferShapeUtilsTestOp, paddle::framework::InferShapeUtilsTestOpMaker, diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0d53a54ff822ae4dde9fcba7c2559569c7e1bd4f..623c8a048c2417ab51772c55b681031d9bcfd925 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -126,6 +126,7 @@ if(WITH_MKLDNN) pass_library(interpolate_mkldnn_pass inference DIR mkldnn) pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR mkldnn) pass_library(cpu_quantize_pass inference DIR mkldnn) pass_library(cpu_quantize_squash_pass inference DIR mkldnn) @@ -157,6 +158,7 @@ endif() cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass 
graph_pattern_detector ) cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) +cc_library(fuse_gemm_epilogue_pass SRCS fuse_gemm_epilogue_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector ) set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f48224cbdc24fe9706a3c4eae029c6dc35381ad2 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.cc @@ -0,0 +1,471 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h" +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FuseGemmEpiloguePass::ApplyImpl(ir::Graph *graph) const { + EpiloguePassActivationCache cache; + + graph = FuseLinearActFwd(graph, {"relu", "gelu"}, false, false, &cache); + graph = FuseLinearActFwd(graph, {"relu"}, true, true, &cache); + graph = FuseLinearActFwd(graph, {"gelu"}, true, false, &cache); + graph = FuseLinearFwd(graph, false); + graph = FuseLinearFwd(graph, true); + graph = FuseLinearActBwd(graph, {"relu_grad"}, true, &cache); + graph = FuseLinearActBwd(graph, {"gelu_grad"}, false, &cache); + graph = FuseLinearBwd(graph, false); + graph = FuseLinearBwd(graph, true); +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearFwd(ir::Graph *graph, + bool is_training) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "x")) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act"); + + linear_act_pattern(x, {}, is_training, false); + + int found_linear_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle LinearAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern); + + 
std::vector matmul_x_shape = subgraph.at(x)->Var()->GetShape(); + std::vector matmul_w_shape = matmul_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear + // currently. The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_op_desc = matmul_op->Op(); + if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) + return; + + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); + std::string activation = "none"; + fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); + fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()}); + fused_gemm_epilogue_op_desc.SetOutput("Out", {ele_out->Name()}); + fused_gemm_epilogue_op_desc.SetAttr("activation", activation); + fused_gemm_epilogue_op_desc.SetAttr("op_role", + matmul_op_desc->GetAttr("op_role")); + auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); + IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node); + IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node); + IR_NODE_LINK_TO(gemm_epilogue_node, ele_out); + + GraphSafeRemoveNodes(g, {matmul_op, matmul_out, ele_add_op}); + + VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name() + << " -> " << matmul_op->Name() << " -> " << matmul_out->Name() + << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name() + << " -> " << ele_add_op->Name() << " -> " << ele_out->Name() + << "\n\t " << ele_out->Name(); + found_linear_count++; + }; + + gpd(graph, handler); + + AddStatis(found_linear_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearActFwd( + ir::Graph *graph, const std::unordered_set &act_types, + bool is_training, bool is_act_grad_x_from_act, + EpiloguePassActivationCache *cache) const { + 
PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "x")) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + patterns::LinearAct linear_act_pattern(gpd.mutable_pattern(), "linear_act"); + + linear_act_pattern(x, act_types, is_training, is_act_grad_x_from_act); + + int found_linear_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle LinearAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_w, matmul_w, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_op, ele_add, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_bias, ele_bias, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_op, act, linear_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, linear_act_pattern); + + std::vector matmul_x_shape = subgraph.at(x)->Var()->GetShape(); + std::vector matmul_w_shape = matmul_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2 from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_op_desc = matmul_op->Op(); + if (!IsGemmFromLinear_(matmul_x_shape, matmul_w_shape, matmul_op_desc)) + return; + + auto activation = act_op->Op()->Type(); + + OpDesc fused_gemm_epilogue_op_desc(matmul_op->Op()->Block()); + fused_gemm_epilogue_op_desc.SetType("fused_gemm_epilogue"); + fused_gemm_epilogue_op_desc.SetInput("X", {subgraph.at(x)->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Y", {matmul_w->Name()}); + fused_gemm_epilogue_op_desc.SetInput("Bias", {ele_bias->Name()}); + fused_gemm_epilogue_op_desc.SetOutput("Out", {act_out->Name()}); + fused_gemm_epilogue_op_desc.SetAttr("activation", activation); + fused_gemm_epilogue_op_desc.SetAttr("op_role", + matmul_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_node = g->CreateOpNode(&fused_gemm_epilogue_op_desc); + + IR_NODE_LINK_TO(subgraph.at(x), gemm_epilogue_node); + IR_NODE_LINK_TO(matmul_w, gemm_epilogue_node); + IR_NODE_LINK_TO(ele_bias, gemm_epilogue_node); + IR_NODE_LINK_TO(gemm_epilogue_node, act_out); + + // Only need to check weight.shape[1] for auxiliary pointer + // and mark it the act op is fused for backward epilogue fusion. + // That because cuBlasLt epilogue's restriction. + if (is_training) { + int divisor_of_n = activation == "relu" ? 
128 : 8; + if (matmul_w_shape[1] % divisor_of_n) return; + + VarDesc reserve_space(patterns::PDNodeName(scope_name, "ReserveSpace")); + auto *reserve_space_node = g->CreateVarNode(&reserve_space); + + cache->InsertFusedActivation( + GetReserveSpaceCacheKey(act_out->Var()->Name(), g->GetBlockId()), + reserve_space_node); + + gemm_epilogue_node->Op()->SetOutput("ReserveSpace", + {reserve_space_node->Name()}); + + if (!is_act_grad_x_from_act) { + GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, linear_act_pattern); + act_grad_op->Op()->RenameInput(ele_out->Name(), + reserve_space_node->Name()); + IR_NODE_LINK_TO(reserve_space_node, act_grad_op); + } + IR_NODE_LINK_TO(gemm_epilogue_node, reserve_space_node); + } + + GraphSafeRemoveNodes(g, + {matmul_op, matmul_out, ele_add_op, ele_out, act_op}); + + VLOG(4) << "\n\t " << subgraph.at(x)->Name() << " and " << matmul_w->Name() + << " -> " << matmul_op->Name() << " -> " << matmul_out->Name() + << "\n\t " << matmul_out->Name() << " and " << ele_bias->Name() + << " -> " << ele_add_op->Name() << " -> " << ele_out->Name() + << "\n\t " << ele_out->Name() << " -> " << act_op->Name() << " -> " + << act_out->Name(); + found_linear_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_linear_act_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearBwd(ir::Graph *graph, + bool without_x_gradient) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *dout = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "dout")) + ->AsInput() + ->assert_is_op_input("elementwise_add_grad", GradVarName("Out")); + + patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern( + gpd.mutable_pattern(), "ele_add_matmul_act"); + ele_add_matmul_act_pattern(dout, {}, without_x_gradient, false); + + int found_ele_add_matmul_act_count 
= 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle ElewiseAddMatmulAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + + Node *matmul_grad_dx = nullptr; + if (!without_x_gradient) { + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx_ptr, matmul_grad_dx, + ele_add_matmul_act_pattern); + matmul_grad_dx = matmul_grad_dx_ptr; + } + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. 
The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + std::string activation_grad = "none"; + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + if (matmul_grad_dx) { + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", + {matmul_grad_dx->Name()}); + } + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + if (matmul_grad_dx) { + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dx); + } + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op}); + + std::string matmul_grad_dx_name = + matmul_grad_dx != nullptr ? 
matmul_grad_dx->Name() : " "; + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_w->Name() << " and " << matmul_grad_dx_name; + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +ir::Graph *FuseGemmEpiloguePass::FuseLinearActBwd( + ir::Graph *graph, const std::unordered_set &act_grad_types, + bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + const std::string scope_name("gemm_epilogue"); + FusePassBase::Init(scope_name, graph); + + GraphPatternDetector gpd; + auto *dout = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(scope_name, "dout")) + ->AsInput() + ->assert_is_op_input("elementwise_add_grad", GradVarName("Out")); + + patterns::ElewiseAddMatmulAct ele_add_matmul_act_pattern( + gpd.mutable_pattern(), "ele_add_matmul_act"); + ele_add_matmul_act_pattern(dout, act_grad_types, false, + is_act_grad_x_from_act); + + int found_ele_add_matmul_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle ElewiseAddMatmulAct fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad_op, ele_add_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_bias, ele_grad_bias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dx, ele_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_grad_dbias, ele_grad_dbias, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_op, matmul_grad, + ele_add_matmul_act_pattern); + 
GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_x, matmul_grad_x, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_w, matmul_grad_w, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dx, matmul_grad_dx, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_grad_dw, matmul_grad_dw, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_op, act_grad, + ele_add_matmul_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad_dx, act_grad_dx, + ele_add_matmul_act_pattern); + + auto key = + GetReserveSpaceCacheKey(matmul_grad_x->Var()->Name(), g->GetBlockId()); + if (!cache->HasFusedActivation(key)) { + return; + } + auto *reserve_space_node = cache->GetFusedActivationSpace(key); + + std::vector matmul_grad_x_shape = matmul_grad_x->Var()->GetShape(); + std::vector matmul_grad_w_shape = matmul_grad_w->Var()->GetShape(); + + // Note (Ming Huang): We only support matmul_v2_grad from paddle.nn.Linear + // currently. The conditions below are used to verify wether matmul_v2 + // is created by paddle.nn.Linear + auto matmul_grad_op_desc = matmul_grad_op->Op(); + if (!IsGemmFromLinear_(matmul_grad_x_shape, matmul_grad_w_shape, + matmul_grad_op_desc)) + return; + + auto activation_grad = act_grad_op->Op()->Type(); + + OpDesc fused_gemm_epilogue_grad_op_desc(ele_add_grad_op->Op()->Block()); + fused_gemm_epilogue_grad_op_desc.SetType("fused_gemm_epilogue_grad"); + fused_gemm_epilogue_grad_op_desc.SetInput("DOut", + {subgraph.at(dout)->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("X", {matmul_grad_x->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("Y", {matmul_grad_w->Name()}); + fused_gemm_epilogue_grad_op_desc.SetInput("ReserveSpace", + {reserve_space_node->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DX", {act_grad_dx->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DY", {matmul_grad_dw->Name()}); + fused_gemm_epilogue_grad_op_desc.SetOutput("DBias", + {ele_grad_dbias->Name()}); + 
fused_gemm_epilogue_grad_op_desc.SetAttr("activation_grad", + activation_grad); + fused_gemm_epilogue_grad_op_desc.SetAttr( + "op_role", matmul_grad_op_desc->GetAttr("op_role")); + + auto gemm_epilogue_grad_node = + g->CreateOpNode(&fused_gemm_epilogue_grad_op_desc); + + IR_NODE_LINK_TO(subgraph.at(dout), gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_x, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(matmul_grad_w, gemm_epilogue_grad_node); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, act_grad_dx); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, matmul_grad_dw); + IR_NODE_LINK_TO(gemm_epilogue_grad_node, ele_grad_dbias); + IR_NODE_LINK_TO(reserve_space_node, gemm_epilogue_grad_node); + + GraphSafeRemoveNodes(g, {ele_add_grad_op, ele_grad_dx, matmul_grad_op, + matmul_grad_dx, act_grad_op}); + + VLOG(4) << "\n\t " << subgraph.at(dout)->Name() << " and " + << ele_grad_bias->Name() << " -> " << ele_add_grad_op->Name() + << " -> " << ele_grad_dx->Name() << " and " + << ele_grad_dbias->Name() << "\n\t " << ele_grad_dx->Name() << ", " + << matmul_grad_x->Name() << " and " << matmul_grad_w->Name() + << " -> " << matmul_grad_op->Name() << " -> " + << matmul_grad_dx->Name() << " and " << matmul_grad_w->Name() + << "\n\t " << matmul_grad_dx->Name() << " -> " + << act_grad_op->Name() << " -> " << act_grad_dx->Name(); + found_ele_add_matmul_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_ele_add_matmul_act_count); + return graph; +} + +bool FuseGemmEpiloguePass::IsGemmFromLinear_( + const std::vector &x_shape, const std::vector &w_shape, + OpDesc *matmul_v2_op) const { + if (w_shape.size() != 2 || x_shape.size() < 2) return false; + for (auto attr_name : + {"fused_reshape_Out", "fused_reshape_X", "fused_reshape_Y", + "fused_transpose_Out", "fused_transpose_X", "fused_transpose_Y"}) { + if (matmul_v2_op->HasAttr(attr_name)) { + std::vector tmp_vec = + BOOST_GET_CONST(std::vector, matmul_v2_op->GetAttr(attr_name)); + if (tmp_vec.size() > 0) return false; + } + } + 
if (BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_x")) || + BOOST_GET_CONST(bool, matmul_v2_op->GetAttr("trans_y"))) + return false; + + return true; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_gemm_epilogue_pass, + paddle::framework::ir::FuseGemmEpiloguePass); diff --git a/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..575ffee73d60e9bd5d4f5af7538d01789268cc9a --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_gemm_epilogue_pass.h @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the ElewiseAdd and activation + */ +class Graph; +class Node; + +class EpiloguePassActivationCache { + public: + EpiloguePassActivationCache() {} + + EpiloguePassActivationCache(const EpiloguePassActivationCache &) = delete; + void operator=(const EpiloguePassActivationCache &) = delete; + + bool HasFusedActivation(const std::string &key) const { + return fused_activation_space_map_.count(key); + } + + ir::Node *GetFusedActivationSpace(const std::string &key) { + if (HasFusedActivation(key)) { + return fused_activation_space_map_.find(key)->second; + } + PADDLE_THROW(platform::errors::InvalidArgument( + "The key (%d) of EpiloguePassActivationCache does not exist.", key)); + } + + void InsertFusedActivation(const std::string &key, ir::Node *const value) { + if (!HasFusedActivation(key)) { + mtx.lock(); + fused_activation_space_map_.insert({key, value}); + mtx.unlock(); + } else { + PADDLE_THROW(platform::errors::AlreadyExists( + "The key (%d) of EpiloguePassActivationCache already exist.", key)); + } + } + + private: + std::unordered_map fused_activation_space_map_; + std::mutex mtx; +}; + +class FuseGemmEpiloguePass : public FusePassBase { + public: + virtual ~FuseGemmEpiloguePass() {} + + protected: + void ApplyImpl(ir::Graph *graph) const override; + + ir::Graph *FuseLinearFwd(ir::Graph *graph, bool is_training) const; + ir::Graph *FuseLinearActFwd(ir::Graph *graph, + const std::unordered_set &act_types, + bool is_training, bool is_act_grad_x_from_act, + EpiloguePassActivationCache *cache) const; + ir::Graph *FuseLinearBwd(ir::Graph *graph, bool without_x_gradient) const; + ir::Graph *FuseLinearActBwd( + ir::Graph *graph, const 
std::unordered_set &act_grad_types, + bool is_act_grad_x_from_act, EpiloguePassActivationCache *cache) const; + + private: + bool IsGemmFromLinear_(const std::vector &x_shape, + const std::vector &w_shape, + OpDesc *matmul_v2_op) const; + const std::string GetReserveSpaceCacheKey(const std::string var_name, + int block_id) const { + return std::to_string(block_id) + var_name; + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index e4c9dc72128f4850b2e0e4af739fdd381e4a3b1e..18068e22b7f3c31d59636bc7ab6a234e109d5ee6 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -918,6 +918,36 @@ PDNode *patterns::ConvActivation::operator()( return activation_out_var; } +PDNode *patterns::ElementwiseActivation::operator()( + paddle::framework::ir::PDNode *elementwise_a, + const std::string &elementwise_type, const std::string &activation_type) { + // Create Operators + elementwise_a->assert_is_op_input(elementwise_type, "X"); + auto *elementwise_op = + pattern->NewNode(elementwise_repr())->assert_is_op(elementwise_type); + auto *activation_op = + pattern->NewNode(activation_repr())->assert_is_op(activation_type); + // Create variables + auto *elementwise_b = pattern->NewNode(elementwise_b_repr()) + ->AsInput() + ->assert_is_op_input(elementwise_type, "Y"); + // intermediate variable, will be removed in the IR after fuse. 
+ auto *elementwise_out_var = + pattern->NewNode(elementwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op(elementwise_type) + ->assert_is_op_input(activation_type); + // output + auto *activation_out_var = pattern->NewNode(activation_out_repr()) + ->AsOutput() + ->assert_is_op_output(activation_type); + + elementwise_op->LinksFrom({elementwise_a, elementwise_b}) + .LinksTo({elementwise_out_var}); + activation_op->LinksFrom({elementwise_out_var}).LinksTo({activation_out_var}); + return activation_out_var; +} + PDNode *patterns::SeqConvEltAddRelu::operator()( paddle::framework::ir::PDNode *seqconv_input) { // Create Operators @@ -1461,31 +1491,6 @@ PDNode *patterns::BatchNormAddActGrad::operator()( return bn_grad; } -PDNode *patterns::ElewiseAddAct::operator()( - paddle::framework::ir::PDNode *ele_x_var, - std::unordered_set act_types) { - auto *ele_y_var = pattern->NewNode(ele_y_repr()) - ->assert_is_op_input("elementwise_add", "Y"); - - auto *ele_add = - pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); - - auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) - ->assert_is_op_output("elementwise_add", "Out"); - - ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); - - auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); - - auto *act_out_var = - pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); - - ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); - act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); - - return act_out_var; -} - PDNode *patterns::ElewiseAddActInplaceGrad::operator()( paddle::framework::ir::PDNode *d_act_out_var, std::unordered_set act_types) { @@ -1526,6 +1531,159 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( return ele_add_grad; } +PDNode *patterns::ElewiseAddAct::operator()( + paddle::framework::ir::PDNode *ele_x_var, + std::unordered_set act_types) { + auto *ele_y_var = pattern->NewNode(ele_y_repr()) + 
->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + + auto *act_out_var = + pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); + + ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); + act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + return act_out_var; +} + +PDNode *patterns::LinearAct::operator()( + paddle::framework::ir::PDNode *linear_x_var, + const std::unordered_set &act_types, bool with_grad_link, + bool is_act_grad_x_from_act) { + auto *matmul_w_var = + pattern->NewNode(matmul_w_repr())->assert_is_op_input("matmul_v2", "Y"); + + auto *matmul = pattern->NewNode(matmul_repr())->assert_is_op("matmul_v2"); + + auto *matmul_out_var = pattern->NewNode(matmul_out_repr()) + ->assert_is_op_output("matmul_v2", "Out"); + + matmul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add", "X"); + + auto *ele_bias_var = pattern->NewNode(ele_bias_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + matmul->LinksFrom({linear_x_var, matmul_w_var}).LinksTo({matmul_out_var}); + ele_add->LinksFrom({matmul_out_var, ele_bias_var}).LinksTo({ele_out_var}); + + if (with_grad_link) { + matmul_out_var->assert_is_op_input("elementwise_add_grad", "X"); + auto *elementwise_add_grad_op = pattern->NewNode("elementwise_add_grad") + ->assert_is_op("elementwise_add_grad"); + elementwise_add_grad_op->LinksFrom({matmul_out_var}); + } + + if (act_types.size() > 0) { + 
ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + auto *act_out_var = pattern->NewNode(act_out_repr()) + ->assert_is_ops_output(act_types, "Out"); + + act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + if (with_grad_link && !is_act_grad_x_from_act) { + std::unordered_set act_grad_types; + for (const auto &act : act_types) { + std::string act_grad(act); + act_grad.append("_grad"); + act_grad_types.insert(act_grad); + } + + ele_out_var->assert_is_ops_input(act_grad_types, "X"); + auto *act_grad_op = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + act_grad_op->LinksFrom({ele_out_var}); + } + + return act_out_var; + } + + return ele_out_var; +} + +PDNode *patterns::ElewiseAddMatmulAct::operator()( + paddle::framework::ir::PDNode *dout_var, + const std::unordered_set &act_grad_types, + bool without_x_gradient, bool is_act_grad_x_from_act) { + auto *ele_grad_bias_var = + pattern->NewNode(ele_grad_bias_repr()) + ->assert_is_op_input("elementwise_add_grad", "Y"); + auto *ele_add_grad = pattern->NewNode(ele_add_grad_repr()) + ->assert_is_op("elementwise_add_grad"); + auto *ele_grad_dx_var = + pattern->NewNode(ele_grad_dx_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("X")); + auto *ele_grad_dbias_var = + pattern->NewNode(ele_grad_dbias_repr()) + ->assert_is_op_output("elementwise_add_grad", GradVarName("Y")); + ele_add_grad->LinksFrom({dout_var, ele_grad_bias_var}) + .LinksTo({ele_grad_dx_var, ele_grad_dbias_var}); + + ele_grad_dx_var->AsIntermediate()->assert_is_op_input("matmul_v2_grad", + GradVarName("Out")); + + auto *matmul_grad_x_var = pattern->NewNode(matmul_grad_x_repr()) + ->assert_is_op_input("matmul_v2_grad", "X"); + auto *matmul_grad_w_var = pattern->NewNode(matmul_grad_w_repr()) + ->assert_is_op_input("matmul_v2_grad", "Y"); + auto *matmul_grad = + pattern->NewNode(matmul_grad_repr())->assert_is_op("matmul_v2_grad"); + auto 
*matmul_grad_dx_var = + pattern->NewNode(matmul_grad_dx_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("X")); + auto *matmul_grad_dw_var = + pattern->NewNode(matmul_grad_dw_repr()) + ->assert_is_op_output("matmul_v2_grad", GradVarName("Y")); + matmul_grad->LinksFrom( + {ele_grad_dx_var, matmul_grad_x_var, matmul_grad_w_var}); + if (without_x_gradient) { + matmul_grad->LinksTo({matmul_grad_dw_var}); + } else { + matmul_grad->LinksTo({matmul_grad_dx_var, matmul_grad_dw_var}); + } + + if (!without_x_gradient && act_grad_types.size() > 0) { + matmul_grad_dx_var->AsIntermediate()->assert_is_ops_input( + act_grad_types, GradVarName("Out")); + + auto *act_grad = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + auto *act_grad_dx_var = + pattern->NewNode(act_grad_dx_repr()) + ->assert_is_ops_output(act_grad_types, GradVarName("X")); + + auto *act_grad_x_var = matmul_grad_x_var; + if (!is_act_grad_x_from_act) { + auto *ele_out_var = pattern->NewNode(ele_out_repr()) + ->assert_is_ops_input(act_grad_types, "X"); + act_grad_x_var = ele_out_var; + } + + act_grad->LinksFrom({matmul_grad_dx_var, act_grad_x_var}) + .LinksTo({act_grad_dx_var}); + return act_grad; + } + + return matmul_grad; +} + // conv_type: conv2d, conv3d, conv2d_transpose PDNode *patterns::ConvBias::operator()( paddle::framework::ir::PDNode *conv_input, std::string conv_type) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d6400ed6945bf8a60c1d4f357bf58a11d5b87094..062d2f9dedce65f6e16b70f0b201a4ca63b0531a 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -487,6 +487,28 @@ struct ConvActivation : public PatternBase { PATTERN_DECL_NODE(activation_out); }; +// Elementwise with Activation +// op: elementwise + activation +// named nodes: +// elementwise_a, elementwise_b, +// elementwise_out, elementwise, +// activation_out, activation 
+struct ElementwiseActivation : public PatternBase { + ElementwiseActivation(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise_add_activation") {} + + PDNode* operator()(PDNode* elementwise_a, const std::string& elementwise_type, + const std::string& activation_type); + + // declare operator node's name + PATTERN_DECL_NODE(elementwise); + PATTERN_DECL_NODE(activation); + // declare variable node's name + PATTERN_DECL_NODE(elementwise_b); + PATTERN_DECL_NODE(elementwise_out); + PATTERN_DECL_NODE(activation_out); +}; + // SEQCONV with Elementwise_Add ReLU // op: seqconv + elementwise_add + relu // named nodes: @@ -863,6 +885,65 @@ struct ElewiseAddActInplaceGrad : public PatternBase { PATTERN_DECL_NODE(ele_y); }; +// The following patterns are used to fuse linear and act (ReLu or GeLU) +// formula: act(F.linear(x)) +// op: matmul_v2 + elementwise_add + act +// named nodes: matmul, elementwise_add, act +// matmul_w, matmul_out +// ele_bias, elewise_add_out, act_out +struct LinearAct : public PatternBase { + LinearAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "linear_act") {} + + PDNode* operator()(PDNode* x, + const std::unordered_set& act_types, + bool with_grad_link, bool is_act_grad_x_from_act); + + // declare operator node's name + PATTERN_DECL_NODE(matmul); + PATTERN_DECL_NODE(ele_add); + PATTERN_DECL_NODE(act); + PATTERN_DECL_NODE(act_grad); + // declare variable node's name + PATTERN_DECL_NODE(matmul_w); + PATTERN_DECL_NODE(matmul_out); + PATTERN_DECL_NODE(elewise_add_out); + PATTERN_DECL_NODE(ele_bias); + PATTERN_DECL_NODE(act_out); +}; + +// The following patterns are used to fuse linear_grad and act_grad (ReLu or +// GeLU) +// formula: the backward of F.linear( act(x) ) +// op: elementwise_add_grad + matmul_v2_grad + act_grad +// named nodes: ele_add_grad, matmul_grad, act_grad +// ele_grad_bias, ele_grad_dx, ele_grad_dbias +// matmul_grad_x, matmul_grad_dx, 
matmul_grad_dx +// matmul_grad_dw, act_grad_dx +struct ElewiseAddMatmulAct : public PatternBase { + ElewiseAddMatmulAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elewiseadd_matmul_act") {} + + PDNode* operator()(PDNode* x, + const std::unordered_set& act_grad_types, + bool without_x_gradient, bool is_act_grad_x_from_act); + + // declare operator node's name + PATTERN_DECL_NODE(ele_add_grad); + PATTERN_DECL_NODE(matmul_grad); + PATTERN_DECL_NODE(act_grad); + // declare variable node's name + PATTERN_DECL_NODE(ele_out); + PATTERN_DECL_NODE(ele_grad_bias); + PATTERN_DECL_NODE(ele_grad_dx); + PATTERN_DECL_NODE(ele_grad_dbias); + PATTERN_DECL_NODE(matmul_grad_x); + PATTERN_DECL_NODE(matmul_grad_w); + PATTERN_DECL_NODE(matmul_grad_dx); + PATTERN_DECL_NODE(matmul_grad_dw); + PATTERN_DECL_NODE(act_grad_dx); +}; + // Conv with Elementwise_add as bias // op: conv + elementwise_add // named nodes: diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index d33dc7f49feb0f4c9e585d13186d65b6c2d618c0..636a594a657cb0744aac161d928ff9078b1f92bc 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -20,12 +20,15 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(scale); USE_OP(elementwise_mul); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add_grad); +PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); + DECLARE_double(eager_delete_tensor_gb); namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc 
b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index c537d05738529dcb885e86cbcabf4405fd75270b..2403e60df3918394e99c3284b2a417e336fc3bae 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace framework { @@ -135,157 +136,9 @@ ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { .End(); } -ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::IdentityConvFunc& - get_node_from_conv_op, - const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - get_node_from_conv_op{get_node_from_conv_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_op; - Node* conv_input; - Node* conv_filter; - Node* conv_output; - - Node* elementwise_add_op; - Node* elementwise_add_identity; - Node* elementwise_add_out; - - std::tie(conv_op, conv_input, conv_filter, conv_output) = - get_node_from_conv_op(subgraph); - std::tie(elementwise_add_op, elementwise_add_identity, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_op, elementwise_add_op)) return; - - if (!IsReachable(graph, elementwise_add_identity, conv_output)) return; - - if (HasFusedActivation(conv_op)) return; - - if (!pass_->IsCompat(subgraph, graph)) 
{ - LOG(WARNING) - << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } - - conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); - conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); - conv_op->Op()->SetAttr("fuse_residual_connection", true); - - GraphSafeRemoveNodes(graph, {conv_output, elementwise_add_op}); - - IR_NODE_LINK_TO(elementwise_add_identity, conv_op); - IR_NODE_LINK_TO(conv_op, elementwise_add_out); - - (*fusion_stats)++; -} - -ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle( - const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_x_op, - const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& - get_node_from_conv_y_op, - const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc& - get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass) - : fusion_stats{std::make_shared(0)}, - can_fuse_func{can_fuse_func}, - get_node_from_conv_x_op{get_node_from_conv_x_op}, - get_node_from_conv_y_op{get_node_from_conv_y_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, - pass_{pass} {} - -void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( - const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - Node* conv_x_op; - Node* conv_x_input; - Node* conv_x_filter; - Node* conv_x_output; - - Node* conv_y_op; - Node* conv_y_input; - Node* conv_y_filter; - Node* conv_y_output; - - Node* elementwise_add_op; - Node* elementwise_add_out; - - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } - - std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) = - get_node_from_conv_x_op(subgraph); - std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) = - get_node_from_conv_y_op(subgraph); - 
std::tie(elementwise_add_op, elementwise_add_out) = - get_node_from_elementwise_add_op(subgraph); - - if (!can_fuse_func(conv_x_op, elementwise_add_op)) return; - if (!can_fuse_func(conv_y_op, elementwise_add_op)) return; - - Node* projection_node; - Node* residual_conv_op; - Node* residual_conv_output; - - if (IsReachable(graph, conv_x_input, conv_y_output)) { - projection_node = conv_x_output; - residual_conv_op = conv_y_op; - residual_conv_output = conv_y_output; - } else if (IsReachable(graph, conv_y_input, conv_x_output)) { - projection_node = conv_y_output; - residual_conv_op = conv_x_op; - residual_conv_output = conv_x_output; - } else { - return; - } - - if (HasFusedActivation(residual_conv_op)) return; - - residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); - residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); - - residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); - - GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op}); - - IR_NODE_LINK_TO(projection_node, residual_conv_op); - IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); - - (*fusion_stats)++; -} - -std::tuple -ResidualConnectionMKLDNNFusePass::GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const { - GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); - GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); - - return std::make_tuple(conv_op, conv_input, conv_filter, conv_output); -} - GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( const std::string& name_scope, const GraphWithStats& graph_with_stats) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = graph_with_stats; - GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); @@ -298,26 +151,56 @@ 
GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); conv_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_y, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_conv_as_x_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_identity, elementwise_add_y, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + + if (!IsReachable(g, elementwise_add_identity, conv_output)) return; + + if (HasFusedActivation(conv_op)) return; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + conv_op->Op()->SetInput("ResidualData", 
{elementwise_add_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + + IR_NODE_LINK_TO(elementwise_add_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_add_out); + + found_conv_as_x_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_x_count + << " conv (as x) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_conv_as_x_count + graph_with_stats.second); } GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( @@ -335,26 +218,56 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( conv_output); conv_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_x, - elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_conv_as_y_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, 
conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + if (FindFuseOption(*conv_op, *elementwise_add_op) != FUSE_MKLDNN) return; + + if (!IsReachable(g, elementwise_add_x, conv_output)) return; + + if (HasFusedActivation(conv_op)) return; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + conv_op->Op()->SetInput("ResidualData", {elementwise_add_x->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {conv_output, elementwise_add_op}); + + IR_NODE_LINK_TO(elementwise_add_x, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_add_out); + + found_conv_as_y_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_conv_as_y_count + << " conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_conv_as_y_count + graph_with_stats.second); } GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( @@ -374,39 +287,84 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( conv_x_output->AsIntermediate(); conv_y_output->AsIntermediate(); - auto get_node_from_elementwise_add = [&elementwise_add_pattern]( - const GraphPatternDetector::subgraph_t& subgraph) - -> std::tuple { - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, - elementwise_add_pattern); - GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, 
elementwise_add_out, - elementwise_add_pattern); - - return std::make_tuple(elementwise_add_op, elementwise_add_out); - }; - - return ExecuteHandleOnGraph( - &gpd, graph_with_stats, - [this, - &conv_x_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_x_pattern, subgraph); - }, - [this, - &conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) { - return GetNodesFromConv(conv_y_pattern, subgraph); - }, - get_node_from_elementwise_add, this); + int found_projection_conv_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(conv_x_op, conv_op, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_input, conv_input, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_filter, conv_filter, conv_x_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_x_output, conv_output, conv_x_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(conv_y_op, conv_op, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_input, conv_input, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_filter, conv_filter, conv_y_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_y_output, conv_output, conv_y_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + + if (FindFuseOption(*conv_x_op, *elementwise_add_op) != FUSE_MKLDNN) return; + if (FindFuseOption(*conv_y_op, *elementwise_add_op) != FUSE_MKLDNN) return; + + Node* projection_node; + Node* residual_conv_op; + Node* residual_conv_output; + if (IsReachable(g, conv_x_input, conv_y_output)) { + projection_node = conv_x_output; + residual_conv_op = conv_y_op; + residual_conv_output = conv_y_output; + } else if (IsReachable(g, conv_y_input, conv_x_output)) { + projection_node = 
conv_y_output; + residual_conv_op = conv_x_op; + residual_conv_output = conv_x_output; + } else { + return; + } + + if (HasFusedActivation(residual_conv_op)) return; + + residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + + residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(g, {residual_conv_output, elementwise_add_op}); + + IR_NODE_LINK_TO(projection_node, residual_conv_op); + IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); + + found_projection_conv_count++; + }; + + gpd(graph_with_stats.first, handler); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_projection_conv_count + << " projection conv (as y) + elementwise_add patterns"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } + + return std::make_pair(graph_with_stats.first, + found_projection_conv_count + graph_with_stats.second); } -void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { +void ResidualConnectionMKLDNNFusePass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init(name_scope_, graph); - auto fused_graph_with_stats = FuseConvAsY( - name_scope_, - FuseConvAsX(name_scope_, - FuseProjectionConv(name_scope_, std::make_pair(graph, 0)))); + auto graph_with_stats = + FuseProjectionConv(name_scope_, std::make_pair(graph, 0)); + graph_with_stats = FuseConvAsX(name_scope_, graph_with_stats); + graph_with_stats = FuseConvAsY(name_scope_, graph_with_stats); - LOG(INFO) << "Fused graph " << fused_graph_with_stats.second << "\n"; - AddStatis(fused_graph_with_stats.second); + AddStatis(graph_with_stats.second); } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index 
c83335da2f629c128fcf4819b2645ab1ef5eae42..c4351b382187d9062a059d013ddb237520645b6d 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -28,19 +28,9 @@ namespace paddle { namespace framework { namespace ir { -class Graph; -class GraphPatternDetector; -class Node; -namespace patterns { -struct Conv; -} // namespace patterns - -using graph_ptr = ir::Graph*; using GraphWithStats = std::pair; -void CorrectGraphEdges(Graph* graph, Node* from, Node* to); bool IsReachable(ir::Graph* graph, Node* from, Node* to); -paddle::optional HasBias(const Node& op, const std::string& bias_name); class ResidualConnectionMKLDNNFusePass : public FusePassBase { private: @@ -52,91 +42,13 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { const std::string& name_scope, const GraphWithStats& graph_with_stats) const; - template - using GetNodeFunc = - std::function; - using IdentityConvFunc = GetNodeFunc>; - using IdentityElementwiseAddFunc = - GetNodeFunc>; - - using ProjectionConvFunc = IdentityConvFunc; - using ProjectionElementwiseAddFunc = GetNodeFunc>; - - using CanFuseFunc = std::function; - - std::tuple GetNodesFromConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - std::tuple GetNodesFromProjectionConv( - const patterns::Conv& conv_pattern, - const GraphPatternDetector::subgraph_t& subgraph) const; - - template - GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd, - const GraphWithStats& graph_with_stats, - OpFuncs&&... 
op_funcs) const { - ir::Graph* graph; - int stats; - - std::tie(graph, stats) = graph_with_stats; - - auto can_fuse = [this](Node* op1, Node* op2) -> bool { - return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN; - }; - auto fuse_handle = HandleType{can_fuse, std::forward(op_funcs)...}; - - (*gpd)(graph, fuse_handle); - - return std::make_pair(graph, stats + fuse_handle.get_stats()); - } - - struct IdentityFuseHandle { - IdentityFuseHandle( - const CanFuseFunc& can_fuse_func, - const IdentityConvFunc& get_node_from_conv_op, - const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - IdentityConvFunc get_node_from_conv_op; - IdentityElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - - struct ProjectionFuseHandle { - ProjectionFuseHandle( - const CanFuseFunc& can_fuse_func, - const ProjectionConvFunc& get_node_from_conv_x_op, - const ProjectionConvFunc& get_node_from_conv_y_op, - const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op, - const ResidualConnectionMKLDNNFusePass* pass); - - void operator()(const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph); - int get_stats() const { return *fusion_stats; } - - private: - std::shared_ptr fusion_stats; - CanFuseFunc can_fuse_func; - ProjectionConvFunc get_node_from_conv_x_op; - ProjectionConvFunc get_node_from_conv_y_op; - ProjectionElementwiseAddFunc get_node_from_elementwise_add_op; - const ResidualConnectionMKLDNNFusePass* pass_; - }; - public: ResidualConnectionMKLDNNFusePass(); virtual ~ResidualConnectionMKLDNNFusePass() {} protected: - void ApplyImpl(graph_ptr graph) const; + void ApplyImpl(ir::Graph* graph) const; + static bool HasFusedActivation(Node* 
conv_node) { return !(conv_node->Op() ->GetAttrIfExists("fuse_activation") diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..b7f7a8071d21413f45d86e98b8649a3aaba5d2f5 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void ElementwiseActivationOneDNNPass::ApplyImpl(Graph *graph) const { + std::vector act_types = { + "relu", "tanh", "leaky_relu", "swish", "hardswish", "sqrt", + "abs", "clip", "gelu", "relu6", "sigmoid"}; + std::vector elt_types = {"elementwise_add", "elementwise_sub", + "elementwise_mul"}; + + for (const auto &elt_type : elt_types) + for (const auto &act_type : act_types) { + std::unordered_map attr_map; + + if (act_type == "swish") + attr_map.emplace("beta", "activation_alpha"); + else if (act_type == "relu6") + attr_map.emplace("threshold", "activation_alpha"); + else if (act_type == "clip") { + attr_map.emplace("min", "activation_alpha"); + attr_map.emplace("max", "activation_beta"); + } else { + attr_map.emplace("alpha", "activation_alpha"); + attr_map.emplace("beta", "activation_beta"); + } + FuseElementwiseAct(graph, elt_type, act_type, attr_map); + } +} + +void ElementwiseActivationOneDNNPass::FuseElementwiseAct( + Graph *graph, const std::string &elt_type, const std::string &act_type, + const std::unordered_map &attr_map) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init("elementwise_act", graph); + + GraphPatternDetector gpd; + auto *elementwise_input = gpd.mutable_pattern() + ->NewNode(elt_type + "_act/elementwise_input") + ->AsInput() + ->assert_is_op_input(elt_type, "X"); + patterns::ElementwiseActivation elementwise_act_pattern(gpd.mutable_pattern(), + elt_type + "_act"); + elementwise_act_pattern(elementwise_input, elt_type, act_type); + + int found_elementwise_activation_count = 0; + auto handler 
= [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "Fuse " << elt_type << " with activation op."; + // Elementwise output + GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, + elementwise_act_pattern); + // ACT output + GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, + elementwise_act_pattern); + // ops + GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, + elementwise_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(activation, activation, elementwise_act_pattern); + + auto *elementwise_op = elementwise->Op(); + + if (elementwise_op->HasAttr("use_mkldnn")) { + const std::string wo_elt_type = + "The " + elt_type; // Workaround for PP error message checking. + PADDLE_ENFORCE_EQ( + BOOST_GET_CONST(bool, elementwise_op->GetAttr("use_mkldnn")), true, + platform::errors::PreconditionNotMet( + wo_elt_type + "+Act fusion may happen only when oneDNN library " + "is used.")); + } + + auto *activation_op = activation->Op(); + for (const auto &attr : attr_map) { + if (activation_op->HasAttr(attr.first)) { + elementwise_op->SetAttr(attr.second, + activation_op->GetAttr(attr.first)); + } + } + + if (act_type == "gelu" && activation_op->HasAttr("approximate") && + BOOST_GET_CONST(bool, activation_op->GetAttr("approximate"))) + elementwise_op->SetAttr("activation_type", std::string("gelu_tanh")); + else + elementwise_op->SetAttr("activation_type", act_type); + + elementwise_op->SetOutput("Out", {activation_out->Name()}); + + IR_OP_VAR_LINK(elementwise, activation_out); + GraphSafeRemoveNodes(g, {activation, elementwise_out}); + found_elementwise_activation_count++; + }; + + gpd(graph, handler); + AddStatis(found_elementwise_activation_count); + PrettyLogDetail("--- fused %d %s with %s activation", + found_elementwise_activation_count, elt_type, act_type); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(elt_act_mkldnn_fuse_pass, + paddle::framework::ir::ElementwiseActivationOneDNNPass); 
+REGISTER_PASS_CAPABILITY(elt_act_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .LE("elementwise_sub", 1) + .LE("elementwise_mul", 1) + .LE("relu", 0) + .LE("tanh", 0) + .LE("leaky_relu", 1) + .LE("swish", 0) + .LE("hard_swish", 0) + .LE("sqrt", 0) + .LE("abs", 0) + .LE("clip", 1) + .LE("gelu", 0) + .LE("relu6", 0) + .LE("sigmoid", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..b8b7d06a828508e9773301bfc602e01f9354eac4 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * \brief Fuse the Elementwise and activation operators into single + * OneDNN's Elementwise with post-op. 
+ */ +class ElementwiseActivationOneDNNPass : public FusePassBase { + public: + virtual ~ElementwiseActivationOneDNNPass() {} + + protected: + void ApplyImpl(Graph *graph) const override; + + void FuseElementwiseAct( + Graph *graph, const std::string &elt_types, const std::string &act_types, + const std::unordered_map &attr_map) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 96aa95bde337436dd6eb584b3eea5395b5301a34..11190309814e7c75777a6cddd7e4d24bfc7ba9e6 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include +#include -#include #include -#include -#include + +#include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" @@ -25,7 +26,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/place.h" -USE_OP(batch_norm); +USE_OP_ITSELF(batch_norm); USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN); USE_OP(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 0a95444f852dd0abdd150d04dc7536e26151c218..ef2e83ced26e07f199a122ee3157eb428b63aec9 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -15,8 +15,9 @@ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include -#include #include + +#include 
#include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -24,11 +25,11 @@ USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); USE_OP(gelu); -USE_OP(relu); -USE_OP(tanh); +USE_OP_ITSELF(relu); +USE_OP_ITSELF(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); namespace paddle { diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 2c3359ffa8e46f0d30a01d73fccb95d88771480a..eadb00b9e88e14075c46a53c711fd43774f26581 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -32,12 +32,12 @@ USE_OP(concat); USE_OP(matmul); USE_OP_ITSELF(elementwise_add); USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP(reduce_mean_grad); USE_OP_ITSELF(reshape2_grad); USE_OP(softmax_with_cross_entropy_grad); @@ -46,15 +46,15 @@ USE_OP(matmul_grad); USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); -USE_OP(elementwise_mul_grad); +USE_OP_ITSELF(elementwise_mul_grad); USE_OP(sigmoid_grad); -USE_OP(tanh_grad); +USE_OP_ITSELF(tanh_grad); USE_OP(sum); USE_OP(slice_grad); USE_OP(lookup_table_grad); USE_OP(sqrt); USE_OP(elementwise_max); -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); USE_OP(sgd); USE_OP(squared_l2_norm); USE_OP(memcpy_h2d); diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index 7b3916bafc93eda8cb1afbf54b706e032c5233dd..bc65231abe7371a931f709c9190b55fde24f0543 
100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -409,7 +409,7 @@ class ThreadPoolTempl { return false; } platform::RecordEvent("SleepWaitForWork", - platform::TracerEventType::UserDefined, 2); + platform::TracerEventType::UserDefined, 10); ec_.CommitWait(waiter); blocked_--; return true; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index c45bf32d8b710cb35ec5f86a4a8ba2e1078537e6..eb40a49b4066a7a8c8e9c142a310b815fd73da20 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -286,8 +286,8 @@ struct OpKernelRegistrarFunctorEx, \ paddle::framework::EmptyGradOpMaker) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6414dd455db4f2e39d958760449e3eb9d7d362f0..f23a266ef03641bc8f8d273b15ab4982e377cb03 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -254,7 +254,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { "reinstall Paddle with CustomDevice support.", place)); #else - platform::DeviceManager::SetDevice(place); + phi::DeviceManager::SetDevice(place); #endif } @@ -539,6 +539,20 @@ bool ExecutionContext::HasInput(const std::string& name) const { return var != nullptr; } +bool ExecutionContext::HasInputs(const std::string& name) const { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end() || it->second.empty()) { + return false; + } + for (const auto* input : it->second) { + if (input == nullptr) { + return false; + } + } + return true; +} + bool ExecutionContext::HasOutput(const std::string& name) const { auto* var = OutputVar(name); return var != nullptr; @@ -2106,15 +2120,19 @@ void OperatorWithKernel::BuildPhiKernelContext( for (size_t offset = 0; offset < outs_vector.size(); ++offset) { phi::TensorBase* tensor_out = nullptr; 
auto* var = outs_vector[offset]; - if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported output `%s` type when call pt kernel.", - framework::ToTypeName(var->Type()))); + + if (var) { + if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported output `%s` type when call pt kernel.", + framework::ToTypeName(var->Type()))); + } } + pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } @@ -2185,41 +2203,109 @@ void OperatorWithKernel::BuildPhiKernelContext( std::move(experimental::MakePhiScalarFromVar(*ins_vector.front()))); } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = Attrs().at(attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + 
pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later - auto& attr = Attrs().at(attr_names[i]); + auto attr_it = attrs_.find(attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); + if (attr_it == attrs_.end()) { + auto in_it = ctx.inputs.find(attr_names[i]); + if (in_it != ctx.inputs.end()) { + // get data from input + auto val = experimental::MakePhiScalarFromVar(*(in_it->second[0])); + int32_t val_int = val.template to(); + pt_kernel_context->EmplaceBackAttr(val_int); + } else { + PADDLE_THROW(platform::errors::NotFound( + "can not find attribute `%s` both in attribute and input ", + attr_names[i])); + } + } else { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(int, attr_it->second)); + } } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(float, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(float, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(bool, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); + pt_kernel_context->EmplaceBackAttr( + 
BOOST_GET_CONST(int64_t, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(std::string))) { - pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::string, attr_it->second)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { auto data_type = paddle::framework::TransToPhiDataType( static_cast( - BOOST_GET_CONST(int, attr))); + BOOST_GET_CONST(int, attr_it->second))); pt_kernel_context->EmplaceBackAttr(data_type); } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + if (std::type_index(attr_it->second.type()) == + std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr_it->second)); + } else if (std::type_index(attr_it->second.type()) == + std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. 
- const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr_it->second); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); pt_kernel_context->EmplaceBackAttr(vector_int64_attr); } - // TODO(YuanRisheng) Need support vector attr - } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const auto& vector_int_attr = + BOOST_GET_CONST(std::vector, attr_it->second); pt_kernel_context->EmplaceBackAttr(vector_int_attr); } else { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e33d4feb82a9e7a92c3dabea0ccc5fe370afda66..1a1171f1dba4d794796ef1421fe386f60a0e587d 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -295,6 +295,8 @@ class ExecutionContext { virtual bool HasInput(const std::string& name) const; + virtual bool HasInputs(const std::string& name) const; + virtual bool HasOutput(const std::string& name) const; virtual size_t InputSize(const std::string& name) const { @@ -449,7 +451,7 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { : ctx_(ctx) {} bool HasInput(const std::string& name) const override { - return ctx_.HasInput(name); + return ctx_.HasInputs(name); } bool HasOutput(const std::string& name) const override { diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index bf9d1baaf394f05d125563311dd2047383373834..47dffd47b7cbbf4a37e6715b40d41024330bc679 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -675,7 +675,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) { USE_PASS(build_cinn_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); 
USE_OP_ITSELF(elementwise_add); -USE_OP(relu_grad); +USE_OP_ITSELF(relu_grad); USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 706815185a1b5b53d1bb8e26274206abc126cfd5..c015e90f71e54691e92c3a36c3d6e053372f64f3 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -241,7 +241,6 @@ std::unique_ptr CinnCompiler::CompileGraph( std::make_unique(target, scope, cinn_graph); GraphCompiler::CompileOptions options; options.with_instantiate_variables = false; - options.with_buffer_handle_instruction_inserted = true; auto compiled_res = graph_compiler->Build(options, std::move(fetch_ids), stream); auto compiled_obj = std::make_unique(); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index e8badab27b9b97aade81bf496ce211485f924757..cdccc4c5546900a141a084281f419c2940b23817 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -301,5 +301,5 @@ TEST(CinnCompilerTest, Compile) { USE_PASS(build_cinn_pass); USE_PASS(graph_viz_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc index 23cb653fef22ac966655e5650d20c128e2bd3cdd..7a7a7b2798f5920f89e15222959a935da9af2c25 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_lib_test.cc @@ -45,8 +45,8 @@ Program CreateAddProgram() { NetBuilder builder("net_builder"); auto a = builder.CreateInput(Float(32), {M, N}); auto b = builder.CreateInput(Float(32), {M, N}); - auto c = builder.add(a, b); - auto d = builder.add(a, c); + auto c = builder.Add(a, b); + auto d = builder.Add(a, c); auto 
program = builder.Build(); return program; @@ -116,8 +116,8 @@ TEST(net_build, program_execute_fc) { auto w = builder.CreateInput(Float(32), {N, K}, "W"); // weight auto b = builder.CreateInput(Float(32), {N}, "B"); // bias - auto mul_out = builder.mul(a, w, 2, 1); - auto add_out = builder.add(mul_out, b); + auto mul_out = builder.Mul(a, w, 2, 1); + auto add_out = builder.Add(mul_out, b); auto program = builder.Build(); #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index e1ce705533ab4ba1c75d8f656683608365e97907..3d8a5ab21f00fcc4137d177b741023a827e325d7 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -33,6 +33,7 @@ if(NOT WIN32) endif() if(WITH_CNCL) cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits) + cc_library(reducer SRCS reducer.cc DEPS layer) endif() if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) @@ -41,7 +42,7 @@ if(NOT WIN32) endif(NOT WIN32) if(WITH_GLOO) cc_library(imperative_gloo_context SRCS gloo_context.cc DEPS collective_helper device_context tensor var_type_traits) - if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) )) + if ( WIN32 OR (NOT (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL OR WITH_CNCL) )) cc_library(reducer SRCS reducer.cc DEPS layer) endif() endif() diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 7416d206fc43eaf5a56c3eb606bb0672d1172c0b..d7478b18dba0616fdc995866d8892c7c052a0e35 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -389,6 +389,9 @@ static void PerformBackwardInplace(const std::string& op_type, } void BasicEngine::Execute() { + platform::RecordEvent backward_record_event( + 
"backward", platform::TracerEventType::Operator, 1); + if (init_nodes_.empty()) { return; } @@ -412,7 +415,7 @@ void BasicEngine::Execute() { for (auto& cur_op : *shared_cur_node) { platform::RecordEvent op_type_record_event( - cur_op.Type(), platform::TracerEventType::Operator, 1); + cur_op.Type() + " grad_node", platform::TracerEventType::Operator, 1); ++op_num; diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index fe5ac73b0046915c4a52087ed792925b0b0ed200..fbc47f81fd33169f54aeb2c251f9b6c90cb44637 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -133,6 +133,11 @@ class DygraphExecutionContext : public framework::ExecutionContext { return (it != var_map_in_.end() && it->second.size() > 0); } + bool HasInputs(const std::string& name) const override { + auto it = var_map_in_.find(name); + return (it != var_map_in_.end() && it->second.size() > 0); + } + bool HasOutput(const std::string& name) const override { auto it = var_map_out_.find(name); return (it != var_map_out_.end() && it->second.size() > 0); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 2317bfdd7c0d5ee94e91e081da47177625f5bfd8..bae49fb381a475dd8227d1dc855a6db28c9cd273 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -247,6 +247,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif #ifdef PADDLE_WITH_XPU_KP + expected_kernel_key.place_ = platform::XPUPlace(); bool use_xpu_kp_kernel_rt = FLAGS_run_kp_kernel && paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 3b5762720e7fb4a9eb0be157f6dabf07aa9353c2..8deb3b93e9c50489dcfc6805063f23e3705cb634 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ 
b/paddle/fluid/imperative/prepared_operator.h @@ -264,14 +264,23 @@ void BuildDygraphPhiKernelContext( size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second); - if ((it == ins.end()) && - (input_defs[i].type_index == - std::type_index(typeid(paddle::optional)))) { - kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); - auto end_idx = start_idx + 1; - kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); - continue; + if (it == ins.end()) { + if (LIKELY(input_defs[i].type_index == + std::type_index( + typeid(paddle::optional)))) { + kernel_ctx->EmplaceBackInputWithoutSetRange(nullptr); + auto end_idx = start_idx + 1; + kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i); + continue; + } else { + PADDLE_THROW(phi::errors::NotFound( + "Can not find input variable '%s' for %s OP, please check whether " + "the name setting in OpArgumentMapping is consistent with that in " + "OpMaker.", + input_names[i], pt_kernel_signature.name)); + } } + auto ins_vector = it->second; size_t end_idx = start_idx + ins_vector.size(); @@ -314,21 +323,25 @@ void BuildDygraphPhiKernelContext( phi::TensorBase* tensor_out = nullptr; auto* var = outs_vector[offset]->MutableVar(); - if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else if (var->template IsType()) { - tensor_out = var->template GetMutable(); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported output `%s` type when call pt kernel.", - framework::ToTypeName(var->Type()))); + if (var) { + if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else if (var->template IsType()) { + tensor_out = var->template GetMutable(); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported output `%s` type when call pt kernel.", + framework::ToTypeName(var->Type()))); + } } + kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } for 
(size_t i = 0; i < attr_names.size(); ++i) { + VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i]; if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute @@ -406,8 +419,74 @@ void BuildDygraphPhiKernelContext( experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } + } else if (ins.find(attr_names[i]) != ins.end()) { + // deal tensor attr here + auto& ins_vector = ins.at(attr_names[i]); + auto tensor_attr = + experimental::MakePhiScalarFromVar(ins_vector[0]->Var()); + if (attr_defs[i].type_index == std::type_index(typeid(int))) { + int val = tensor_attr.template to(); + kernel_ctx->EmplaceBackAttr(val); + } else { + PADDLE_THROW(platform::errors::Unimplemented("only support int here")); + } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + 
std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later + auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); if (attr_defs[i].type_index == std::type_index(typeid(int))) { kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(int, attr)); @@ -429,7 +508,11 @@ void BuildDygraphPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + std::type_index(typeid(std::vector))) { + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. 
const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 3a6365b2af21ae9012fe37293699caed9bb23855..fec9afbf3b403ca2fd45633326c7f7dec46e1243 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -31,7 +31,7 @@ namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL) // div the nranks void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { framework::Tensor *tensor = @@ -67,6 +67,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { #ifdef PADDLE_WITH_XPU_BKCL // TODO(liuyuhui) support xpu about div nranks in the future #endif + } else if (platform::is_mlu_place(tensor->place())) { + // TODO(zhangna) + VLOG(4) << "divnrank for mlu not support yet"; } } @@ -222,6 +225,56 @@ void SplitTensorsWithType( } #endif +#ifdef PADDLE_WITH_CNCL +// context is used to select the stream for concat +template <> +void ConcatTensorsWithType( + const platform::MLUDeviceContext &context, + const std::vector &dense_tensors_, + framework::Variable *p_dense_contents, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP16: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + case framework::proto::VarType::FP32: + ConcatTensorsForAllReduce( + context, dense_tensors_, p_dense_contents); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} + +// context is used to select the stream for split +template <> +void SplitTensorsWithType( + const 
platform::MLUDeviceContext &context, + framework::Variable *p_dense_contents, + std::vector *p_dense_tensors, + framework::proto::VarType::Type type) { + switch (type) { + case framework::proto::VarType::FP16: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + case framework::proto::VarType::FP32: + SplitTensorsForAllReduce( + context, p_dense_contents, p_dense_tensors); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors for " + "allreduce.", + framework::DataTypeToString(type))); + } +} +#endif + void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { @@ -253,6 +306,16 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't concat npu grads since it's not compiled with HCCL," "Please recompile or reinstall Paddle with HCCL support.")); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_CNCL + ConcatTensorsWithType( + static_cast(context), + dense_tensors_, &dense_contents_, dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat mlu grads since it's not compiled with CNCL," + "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { ConcatTensorsWithType( @@ -295,6 +358,16 @@ void Group::SplitTensors(const platform::DeviceContext &context) { PADDLE_THROW(platform::errors::PermissionDenied( "Paddle can't split npu grad since it's not compiled with HCCL," "Please recompile or reinstall Paddle with HCCL support.")); +#endif + } else if (platform::is_mlu_place(place)) { +#ifdef PADDLE_WITH_CNCL + SplitTensorsWithType( + static_cast(context), + &dense_contents_, &dense_tensors_, dtype_); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split mlu grad since it's not 
compiled with CNCL," + "Please recompile or reinstall Paddle with CNCL support.")); #endif } else if (platform::is_cpu_place(place)) { SplitTensorsWithType( @@ -746,6 +819,11 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { // TODO(liuyuhui) support XPU set constant VLOG(3) << "XPU doesn't support set_constant"; } +#elif defined(PADDLE_WITH_CNCL) + if (platform::is_mlu_place(group_tensor.place())) { + // TODO(liuyuhui) support MLU set constant + VLOG(3) << "MLU doesn't support set_constant"; + } #else auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); if (HasGrad(var_index)) { @@ -846,12 +924,13 @@ void Reducer::MarkGroupReady(size_t group_index) { cv_.notify_all(); } }); -#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ - defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) +#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \ + defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \ + defined(PADDLE_WITH_CNCL) FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "Not compiled with BKCL or NCCL or GLOO.")); + "Not compiled with BKCL or NCCL or CNCL or GLOO.")); #endif } } diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index cca773b840c279f05cd6bcd0ed82fda7fdd55a25..9fac4b41cbde01f365dcc603844b06c473a58843 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -45,7 +45,7 @@ namespace imperative { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ - defined(PADDLE_WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL) template struct DivNRanksFunctor { diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index e4f1cfdb3baeed9b5945b7843b6593528df48c29..09de0106ed6190c5f627ba9fb7cc038593b5088a 
100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -21,6 +21,6 @@ cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info s cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy) cc_test(test_eager SRCS test_eager.cc DEPS tracer layer prepared_operator mul_op) -if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL) +if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_CNCL) cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy) endif() diff --git a/paddle/fluid/imperative/tests/test_group.cc b/paddle/fluid/imperative/tests/test_group.cc index 6c304278d21fde7af093b25cdd8f62a1d4528d31..5e674af1a08a87c11bfab1080be42e623661b38e 100644 --- a/paddle/fluid/imperative/tests/test_group.cc +++ b/paddle/fluid/imperative/tests/test_group.cc @@ -72,8 +72,10 @@ void GroupConcatSplit(Place place, size_t size) { value.push_back(static_cast(1.0 * j)); } - if (std::is_same::value) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (std::is_same::value || + std::is_same::value) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_CNCL) paddle::memory::Copy(place, data, cpu_place, value.data(), sizeof(T) * value.size(), 0); #endif @@ -180,5 +182,19 @@ TEST(TestGroup, TestXPUConcatSplit) { } #endif +#if defined(PADDLE_WITH_CNCL) +TEST(TestGroup, TestMLUConcatSplit) { + platform::MLUPlace mlu_place(0); + platform::CPUPlace cpu_place; + + int size = 3; + GroupConcatSplit(cpu_place, size); + GroupConcatSplit(mlu_place, size); + + size = 15; + GroupConcatSplit(cpu_place, size); + GroupConcatSplit(mlu_place, size); +} +#endif } // namespace imperative } // namespace paddle diff --git 
a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 3ac2028790608529e0745dde2ce41ed57748f46d..02a1689c23a3fe5e1543a2e52d7661d5997bc062 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -24,6 +24,10 @@ #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); namespace platform = paddle::platform; namespace framework = paddle::framework; diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index f5ca13cb99ad3df6b9283565b5681c36f7197ae8..4cda3f32fdf3fdd2d14b201fa902c1f50f3ff98d 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -24,6 +24,13 @@ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/imperative/prepared_operator.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(relu, GPU, ALL_LAYOUT); +#endif namespace imperative = paddle::imperative; namespace platform = paddle::platform; @@ -226,7 +233,7 @@ TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) { } // namespace paddle USE_OP_ITSELF(split); -USE_OP(relu); +USE_OP_ITSELF(relu); #ifdef PADDLE_WITH_MKLDNN USE_OP_DEVICE_KERNEL(relu, MKLDNN); #endif diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index d05036f7a12ebdc3db5fbfda5eb50c295c0478e4..f754c6fdd0ee7742f0e544baad0225502c172848 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -28,6 
+28,14 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT); +#endif namespace imperative = paddle::imperative; namespace platform = paddle::platform; @@ -591,5 +599,5 @@ TEST(test_tracer, eager_tracer) { USE_OP(mul); USE_OP(mul_grad); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 85bcbd1458f24a592b646dfcda750f37f113f73f..c55599cc9aa954e2bd437f0917c792e4fdb6b577 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -18,12 +18,14 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" +#include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/common/place.h" DECLARE_bool(use_mkldnn); DECLARE_string(tracer_mkldnn_ops_on); @@ -175,7 +177,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { platform::RecordEvent op_type_record_event( - type, platform::TracerEventType::Operator, 1); + type + " trace_op", platform::TracerEventType::Operator, 1); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { @@ 
-253,7 +255,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, #endif } else if (platform::is_custom_place(place)) { #ifdef PADDLE_WITH_CUSTOM_DEVICE - platform::DeviceManager::SetDevice(place); + phi::DeviceManager::SetDevice(place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with CustomDevice if use " @@ -295,19 +297,24 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, program_desc_tracer_->InsertOp(type, new_ins, outs, attrs); } - if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { - PADDLE_ENFORCE_EQ( - passed_default_attrs_, nullptr, - paddle::platform::errors::PermissionDenied( - "We expect passed_default_attrs_ is nullptr while " - "use_default_attr_map is true, however we got not null " - "passed_default_attrs_. Please check your usage of trace_op. ")); - CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, - inplace_map); - } else { - VLOG(3) << "No Grad to track for Op: " << type; + { + platform::RecordEvent node_creation_record_event( + type + " node_creation", platform::TracerEventType::Operator, 1); + + if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { + PADDLE_ENFORCE_EQ( + passed_default_attrs_, nullptr, + paddle::platform::errors::PermissionDenied( + "We expect passed_default_attrs_ is nullptr while " + "use_default_attr_map is true, however we got not null " + "passed_default_attrs_. Please check your usage of trace_op. 
")); + CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, + inplace_map); + } else { + VLOG(3) << "No Grad to track for Op: " << type; + } + VLOG(6) << "Finish Trace Op: " << type; } - VLOG(6) << "Finish Trace Op: " << type; } template void Tracer::TraceOp( @@ -382,5 +389,36 @@ bool Tracer::ComputeRequiredGrad(const NameTensorMap& ins, return false; } +phi::KernelSignature Tracer::GetExpectedKernelSignature( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs) const { + auto op = framework::OpRegistry::CreateOp(type, {}, {}, {}, false); + framework::RuntimeContext ctx({}, {}); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(phi::CPUPlace()); + const auto& op_info = op->Info(); + auto* attr_checker = op_info.Checker(); + if (attr_checker) { + attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); + } + static paddle::framework::AttributeMap empty_attrs_map = {}; + const paddle::framework::AttributeMap& default_attrs = + attr_checker == nullptr ? 
empty_attrs_map + : attr_checker->GetDefaultAttrMap(); + auto dygraph_exe_ctx = + imperative::DygraphExecutionContext( + *op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, + default_attrs); + auto* opbase_with_kernel = + dynamic_cast(op.get()); + PADDLE_ENFORCE_NE(opbase_with_kernel, nullptr, + platform::errors::InvalidArgument( + "This op type:`%s` is not a OperatorWithKernel, only " + "OperatorWithKernel can get KernelSignature", + type)); + return phi::KernelSignature( + std::move(opbase_with_kernel->GetExpectedPhiKernelArgs(dygraph_exe_ctx))); +} + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 73ecbbe6143ca8e68049c2d2886e9eee93b741f1..fd13fce6a6e17a47a7a91dfa78598a99ec22f0b7 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -28,6 +28,7 @@ #include "paddle/fluid/imperative/jit/program_desc_tracer.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/phi/core/compat/arg_map_context.h" namespace paddle { namespace imperative { @@ -154,6 +155,10 @@ class Tracer { } } + phi::KernelSignature GetExpectedKernelSignature( + const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs) const; + paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 26b8b9e8e17e046964d648f564c26293036e4033..5d0c3c98d2f618eb1f3d41e6a4e2434e5cd80401 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -45,6 +45,11 @@ add_subdirectory(api) set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) + +if(WITH_ONNXRUNTIME) + set(STATIC_INFERENCE_API 
${STATIC_INFERENCE_API} onnxruntime_predictor) +endif() + #TODO(wilber, T8T9): Do we still need to support windows gpu static library? if(WIN32 AND WITH_GPU) cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules}) @@ -91,6 +96,13 @@ if (WITH_PSCORE) set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service) endif () +if (WITH_ONNXRUNTIME) + set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} + ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc + ) + set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} onnxruntime_predictor) +endif (WITH_ONNXRUNTIME) + # Create shared inference library cc_library(paddle_inference_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${SHARED_INFERENCE_DEPS}) diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 87efe5ec5190372b48f1bd6387e1c92f456865a1..bdc16ef4c7907764473c552461cde35f011ad489 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -31,7 +31,7 @@ cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tens cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) set(paddle_inference_api_deps lod_tensor scope reset_tensor_array - analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator custom_kernel) + analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator) if(WITH_CRYPTO) list(APPEND paddle_inference_api_deps paddle_crypto) @@ -49,8 +49,15 @@ if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() -cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} - zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) +if (WITH_ONNXRUNTIME) + cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + zero_copy_tensor 
ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx) + cc_library(onnxruntime_predictor SRCS onnxruntime_predictor.cc DEPS analysis_predictor) +else (WITH_ONNXRUNTIME) + cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) +endif (WITH_ONNXRUNTIME) + cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -75,6 +82,16 @@ elseif (WIN32) ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() +if (WITH_ONNXRUNTIME) + if (NOT APPLE AND NOT WIN32) + cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS paddle_inference_shared + ARGS --dirname=${MOBILENETV2_MODEL_DIR}) + elseif (WIN32) + cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS onnxruntime_predictor benchmark ${inference_deps} + ARGS --dirname=${MOBILENETV2_MODEL_DIR}) + endif() +endif() + if(WITH_TESTING AND WITH_MKLDNN) if (NOT APPLE AND NOT WIN32) cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 9c33d7003064532db7276d0f6dad90e1b2c55104..41c01d3b7e261314d8dc6b852f5b2a597421fe48 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -168,6 +168,33 @@ void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num, Update(); } +void AnalysisConfig::EnableONNXRuntime() { +#ifdef PADDLE_WITH_ONNXRUNTIME + use_onnxruntime_ = true; +#else + LOG(ERROR) << "Please compile with onnxruntime to EnableONNXRuntime()"; + use_onnxruntime_ = false; +#endif + + Update(); +} + +void AnalysisConfig::DisableONNXRuntime() { + use_onnxruntime_ = false; + Update(); +} + +void AnalysisConfig::EnableORTOptimization() { +#ifdef PADDLE_WITH_ONNXRUNTIME + 
enable_ort_optimization_ = true; +#else + LOG(ERROR) << "Please compile with onnxruntime to EnableORTOptimization()"; + enable_ort_optimization_ = false; +#endif + + Update(); +} + AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { #define CP_MEMBER(member__) member__ = other.member__; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 5492c3b0d26453c590e6a0a1350d88b442b789f7..871ed596a3ee9d6362b03e99ca10313765826a51 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -65,6 +65,10 @@ #include "paddle/fluid/inference/api/mkldnn_quantizer.h" #endif +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" +#endif + #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/helper.h" @@ -80,6 +84,8 @@ using inference::tensorrt::TRTCalibratorEngine; using inference::tensorrt::TRTCalibratorEngineManager; #endif +int AnalysisPredictor::clone_num_ = 1; + namespace { bool IsPersistable(const framework::VarDesc *var) { if (var->Persistable() && @@ -1633,7 +1639,7 @@ std::unique_ptr AnalysisPredictor::Clone() { std::lock_guard lk(clone_mutex_); auto *x = new AnalysisPredictor(config_); x->Init(scope_, inference_program_); - x->executor_->ResetTrtOps(++x->clone_num_); + x->executor_->ResetTrtOps(++AnalysisPredictor::clone_num_); return std::unique_ptr(x); } @@ -1760,6 +1766,27 @@ namespace paddle_infer { Predictor::Predictor(const Config &config) { const_cast(&config)->SwitchUseFeedFetchOps(false); // The second parameter indicates that the discard log is not printed + if (config.use_onnxruntime()) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (config.use_gpu()) { + LOG(WARNING) << "The current ONNXRuntime backend doesn't support GPU," + "and it falls back to use Paddle Inference."; + } else if 
(!paddle::CheckConvertToONNX(config)) { + LOG(WARNING) + << "Paddle2ONNX do't support convert the Model, fall back to using " + "Paddle Inference."; + } else { + predictor_ = paddle::CreatePaddlePredictor< + Config, paddle::PaddleEngineKind::kONNXRuntime>(config); + return; + } +#else + LOG(WARNING) + << "The onnxruntime backend isn't enabled," + " and please re-compile Paddle with WITH_ONNXRUNTIME option," + "fall back to using Paddle Inference."; +#endif + } predictor_ = paddle::CreatePaddlePredictor< Config, paddle::PaddleEngineKind::kAnalysis>(config); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 8ed183dae0b1b00f8e0014b2d9b470ac177152f0..21a7e9658bbeeb16d4cbff6364aaef68edcae16d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -486,7 +486,7 @@ class AnalysisPredictor : public PaddlePredictor { bool status_is_cloned_{false}; std::map>> shape_info_; - int clone_num_{1}; + static int clone_num_; #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) && \ !defined(PADDLE_WITH_ASCEND_CL) diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 9c7e5c6b27e68ee10be5f8b56d6de4aea4524078..2c6e8f4f1a4d9ea0dfba8f400c7d3782a5e2c32d 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -357,6 +357,24 @@ TEST(AnalysisPredictor, set_xpu_device_id) { } #endif +TEST(AnalysisPredictor, enable_onnxruntime) { + AnalysisConfig config; + config.EnableONNXRuntime(); +#ifdef PADDLE_WITH_ONNXRUNTIME + ASSERT_TRUE(config.use_onnxruntime()); +#else + ASSERT_TRUE(!config.use_onnxruntime()); +#endif + config.EnableORTOptimization(); +#ifdef PADDLE_WITH_ONNXRUNTIME + ASSERT_TRUE(config.ort_optimization_enabled()); +#else + ASSERT_TRUE(!config.ort_optimization_enabled()); +#endif + 
config.DisableONNXRuntime(); + ASSERT_TRUE(!config.use_onnxruntime()); +} + } // namespace paddle namespace paddle_infer { @@ -408,6 +426,14 @@ TEST(Predictor, Run) { predictor->TryShrinkMemory(); } +TEST(Predictor, EnableONNXRuntime) { + Config config; + config.SetModel(FLAGS_dirname); + config.EnableONNXRuntime(); + config.EnableORTOptimization(); + auto predictor = CreatePredictor(config); +} + TEST(Tensor, CpuShareExternalData) { Config config; config.SetModel(FLAGS_dirname); diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index d03840ada36bce8cfdc2213284697e6d873cbde0..df98a7b05cf3f2035e9a21ec10e4b44eca843bbd 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -4,6 +4,7 @@ option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL. option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) option(USE_TENSORRT "Compile demo with TensorRT." 
OFF) +option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) if(NOT WITH_STATIC_LIB) add_definitions("-DPADDLE_WITH_SHARED_LIB") @@ -46,6 +47,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (WITH_ONNXRUNTIME) + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") + + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") +endif() if (WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") @@ -151,6 +159,17 @@ else() endif() endif() +if (WITH_ONNXRUNTIME) + if(WIN32) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx) + elseif(APPLE) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx) + else() + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx) + endif() +endif() + + if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} @@ -213,6 +232,14 @@ if(WIN32) COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release ) endif() + if(WITH_ONNXRUNTIME) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + endif() if(NOT WITH_STATIC_LIB) add_custom_command(TARGET ${DEMO_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} diff --git 
a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef5c08cd041eb7af4c7f17a95c4fd9b8601e4bad --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains demo of mobilenet for tensorrt. + */ + +#include // use glog instead of CHECK to avoid importing other paddle header files. +#include +#include "gflags/gflags.h" +#include "utils.h" // NOLINT + +DEFINE_string(modeldir, "", "Directory of the inference model."); + +namespace paddle { +namespace demo { + +/* + * Use the onnxruntime engine to inference the demo. + */ +void Main() { + paddle::AnalysisConfig config; + config.EnableONNXRuntime(); + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + auto predictor = paddle_infer::CreatePredictor(config); + + // Inference. 
+ std::vector input_shape = {1, 3, 224, 224}; + std::vector input_data(1 * 3 * 224 * 224, 1.0); + std::vector out_data; + out_data.resize(1000); + auto input_names = predictor->GetInputNames(); + auto output_names = predictor->GetOutputNames(); + auto input_tensor = predictor->GetInputHandle(input_names[0]); + input_tensor->Reshape(input_shape); + auto output_tensor = predictor->GetOutputHandle(output_names[0]); + + input_tensor->CopyFromCpu(input_data.data()); + predictor->Run(); + output_tensor->CopyToCpu(out_data.data()); + + VLOG(3) << "output.size " << out_data.size(); +} + +} // namespace demo +} // namespace paddle + +int main(int argc, char** argv) { + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + paddle::demo::Main(); + return 0; +} diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 5f062e8063253a08466b2491e80417af07047394..79a31555c7f0b1cb4a8d9c48bae16145d605935b 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -21,7 +21,8 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset USE_TENSORRT=$5 TENSORRT_ROOT_DIR=$6 # TensorRT root dir, default to /usr -MSVC_STATIC_CRT=$7 +WITH_ONNXRUNTIME=$7 +MSVC_STATIC_CRT=$8 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform @@ -38,6 +39,26 @@ else use_gpu_list='false' fi +mkdir -p $DATA_DIR +cd $DATA_DIR + +if [ $7 == ON ]; then + ONNXRUNTIME_LIB=${inference_install_dir}/third_party/install/onnxruntime/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ONNXRUNTIME_LIB} + PADDLE2ONNX_LIB=${inference_install_dir}/third_party/install/paddle2onnx/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE2ONNX_LIB} + #download model + mkdir -p MobileNetV2 + cd MobileNetV2 + if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then + echo "MobileNetV2.inference.model.tar.gz has been downloaded." 
+ else + wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz + tar xzf *.tar.gz + fi + cd .. +fi + PREFIX=inference-vis-demos%2F URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX} @@ -58,8 +79,7 @@ function download() { fi cd .. } -mkdir -p $DATA_DIR -cd $DATA_DIR + vis_demo_list='se_resnext50 ocr mobilenet' for vis_demo_name in $vis_demo_list; do download $vis_demo_name @@ -93,7 +113,8 @@ for WITH_STATIC_LIB in ON OFF; do -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do Release/simple_on_word2vec.exe \ @@ -112,7 +133,8 @@ for WITH_STATIC_LIB in ON OFF; do -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do @@ -138,7 +160,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln Release/trt_mobilenet_demo.exe \ --modeldir=$DATA_DIR/mobilenet/model \ @@ -156,7 +179,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model' if [ -d 
$word2vec_model ]; then @@ -176,7 +200,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do @@ -200,7 +225,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) ./trt_mobilenet_demo \ --modeldir=$DATA_DIR/mobilenet/model \ @@ -211,6 +237,26 @@ for WITH_STATIC_LIB in ON OFF; do exit 1 fi fi + + # --------onnxruntime mobilenetv2 on linux/mac------ + if [ $WITH_ONNXRUNTIME == ON ]; then + rm -rf * + cmake .. -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=onnxruntime_mobilenet_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME + make -j$(nproc) + ./onnxruntime_mobilenet_demo \ + --modeldir=$DATA_DIR/MobileNetV2/MobileNetV2 + if [ $? -ne 0 ]; then + echo "onnxruntime demo onnxruntime_mobilenet_demo runs fail." + exit 1 + fi + fi fi done set +x diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee82da139d8f39c26002763c4a4835050c48fc99 --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -0,0 +1,354 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid//platform/device/gpu/gpu_types.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/inference/utils/io_utils.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { + +framework::proto::VarType::Type ConvertONNXType( + ONNXTensorElementDataType type) { + switch (type) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: + return framework::proto::VarType::FP32; + // case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + // return DataType::FP16; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: + return framework::proto::VarType::INT8; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: + return framework::proto::VarType::INT32; + case 
ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: + return framework::proto::VarType::INT64; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: + return framework::proto::VarType::UINT8; + default: + LOG(ERROR) << "unsupported ONNX Tensor Type: " << static_cast(type); + return framework::proto::VarType::FP32; + } +} + +bool CheckConvertToONNX(const AnalysisConfig &config) { + if (!config.model_dir().empty()) { + LOG(ERROR) << "Paddle2ONNX not support model_dir config"; + // TODO(heliqi jiangjiajun): Paddle2ONNX not support + // config.model_dir() + "/__model__" + // config.model_dir() + var_name + return false; + } else if (config.prog_file().empty() || config.params_file().empty()) { + LOG(ERROR) << string::Sprintf( + "not valid model path '%s' or program path '%s' or params path '%s'.", + config.model_dir(), config.prog_file(), config.params_file()); + return false; + } + return paddle2onnx::IsExportable(config.prog_file(), config.params_file(), + config.model_from_memory()); +} + +bool ONNXRuntimePredictor::Init() { + VLOG(3) << "ONNXRuntime Predictor::init()"; + + // Now ONNXRuntime only suuport CPU + if (config_.use_gpu()) { + place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); + } else { + place_ = paddle::platform::CPUPlace(); + } + scope_.reset(new paddle::framework::Scope()); + sub_scope_ = &scope_->NewScope(); + + std::string onnx_proto; + paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto, + config_.model_from_memory()); + + Ort::SessionOptions session_options; + if (config_.ort_optimization_enabled()) { + session_options.SetGraphOptimizationLevel( + GraphOptimizationLevel::ORT_ENABLE_ALL); + } + // Turn optimization off first, and then turn it on when it's stable + // session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + // session_options.EnableCpuMemArena(); + // session_options.EnableMemPattern(); + // session_options.SetInterOpNumThreads(config_.cpu_math_library_num_threads()); + 
session_options.SetIntraOpNumThreads(config_.cpu_math_library_num_threads()); + VLOG(2) << "ONNXRuntime threads " << config_.cpu_math_library_num_threads(); + if (config_.profile_enabled()) { + LOG(WARNING) << "ONNXRuntime Profiler is activated, which might affect the " + "performance"; +#if defined(_WIN32) + session_options.EnableProfiling(L"ONNX"); +#else + session_options.EnableProfiling("ONNX"); +#endif + } else { + VLOG(2) << "ONNXRuntime Profiler is deactivated, and no profiling report " + "will be " + "generated."; + } + session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options}; + + auto memory_info = + Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + Ort::Allocator allocator(session_, memory_info); + + framework::proto::VarType::Type proto_type = + framework::proto::VarType::LOD_TENSOR; + size_t n_inputs = session_.GetInputCount(); + for (size_t i = 0; i < n_inputs; ++i) { + auto input_name = session_.GetInputName(i, allocator); + auto type_info = session_.GetInputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); + auto *ptr = scope_->Var(input_name); + framework::InitializeVariable(ptr, proto_type); + allocator.Free(input_name); + } + + size_t n_outputs = session_.GetOutputCount(); + for (size_t i = 0; i < n_outputs; ++i) { + auto output_name = session_.GetOutputName(i, allocator); + auto type_info = session_.GetOutputTypeInfo(i); + std::vector shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); + auto *ptr = scope_->Var(output_name); + framework::InitializeVariable(ptr, proto_type); + allocator.Free(output_name); + } + + return true; +} + +template 
<> +std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig &config) { + if (config.glog_info_disabled()) { + FLAGS_logtostderr = 1; + FLAGS_minloglevel = 2; // GLOG_ERROR + } + + PADDLE_ENFORCE_EQ( + config.is_valid(), true, + platform::errors::InvalidArgument( + "Note: Each config can only be used for one predictor.")); + + VLOG(3) << "create ONNXRuntimePredictor"; + + std::unique_ptr predictor(new ONNXRuntimePredictor(config)); + // Each config can only be used for one predictor. + config.SetInValid(); + auto predictor_p = dynamic_cast(predictor.get()); + + if (!predictor_p->Init()) { + return nullptr; + } + + return predictor; +} + +std::vector ONNXRuntimePredictor::GetInputNames() { + std::vector input_names; + for (auto input_desc : input_desc_) { + input_names.push_back(input_desc.name); + } + return input_names; +} + +std::map> +ONNXRuntimePredictor::GetInputTensorShape() { + std::map> input_shapes; + for (auto input_desc : input_desc_) { + input_shapes[input_desc.name] = input_desc.shape; + } + return input_shapes; +} + +std::vector ONNXRuntimePredictor::GetOutputNames() { + std::vector output_names; + for (auto output_desc : output_desc_) { + output_names.push_back(output_desc.name); + } + return output_names; +} + +std::unique_ptr ONNXRuntimePredictor::GetInputTensor( + const std::string &name) { + PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), + platform::errors::PreconditionNotMet( + "The in variable named %s is not found in the " + "scope of the ONNXPredictor.", + name)); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(scope_.get()))); + res->input_or_output_ = true; + res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = place_; + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + return res; +} + +std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( + const std::string &name) { + PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), + 
platform::errors::PreconditionNotMet( + "The out variable named %s is not found in the " + "scope of the ONNXPredictor.", + name)); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(scope_.get()))); + res->input_or_output_ = false; + res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = place_; + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + return res; +} + +Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc, + const char *device_name) { + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + auto *var = scope_->FindVar(desc.name); + auto *tensor = var->GetMutable(); + size_t size = + tensor->numel() * + framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype())); + std::vector shape = phi::vectorize(tensor->dims()); + return Ort::Value::CreateTensor(memory_info, + static_cast(tensor->data()), size, + shape.data(), shape.size(), desc.dtype); +} + +void ONNXRuntimePredictor::AsTensor(const Ort::Value &value, + const ONNXDesc &desc) { + auto info = value.GetTensorTypeAndShapeInfo(); + + auto *var = scope_->FindVar(desc.name); + auto *tensor = var->GetMutable(); + tensor->Resize(phi::make_ddim(info.GetShape())); + auto dtype = ConvertONNXType(info.GetElementType()); + auto *ptr = tensor->mutable_data(place_, dtype); + + if (platform::is_cpu_place(place_)) { + std::memcpy(ptr, const_cast(value.GetTensorData()), + tensor->numel() * framework::SizeOfType(dtype)); + } else { + auto src_place = place_; + auto dst_place = place_; + memory::Copy(dst_place, ptr, src_place, + const_cast(value.GetTensorData()), + tensor->numel() * framework::SizeOfType(dtype)); + } +} + +bool ONNXRuntimePredictor::Run(const std::vector &inputs, + std::vector *output_data, + int batch_size) { + LOG(ERROR) << "Not support Run"; + return false; +} + +bool ONNXRuntimePredictor::ZeroCopyRun() { + try { + Ort::IoBinding 
binding(session_); + std::vector inputs; + std::vector outputs; + Ort::RunOptions options; + + inputs.reserve(input_desc_.size()); + const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu"; + for (auto desc : input_desc_) { + inputs.push_back(GetOrtValue(desc, device_name)); + binding.BindInput(desc.name.c_str(), inputs.back()); + } + + // TODO(heliqi): Optimization —— move to Init() + for (auto desc : output_desc_) { + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + binding.BindOutput(desc.name.c_str(), memory_info); + } + + session_.Run({}, binding); + + outputs = binding.GetOutputValues(); + for (size_t i = 0; i < output_desc_.size(); ++i) { + AsTensor(outputs[i], output_desc_[i]); + } + } catch (const std::exception &e) { + LOG(ERROR) << e.what(); + return false; + } + + return true; +} + +std::unique_ptr ONNXRuntimePredictor::Clone() { + LOG(ERROR) << "Not support Clone(), Please create new Predictor"; + return nullptr; +} + +uint64_t ONNXRuntimePredictor::TryShrinkMemory() { + return paddle::memory::Release(place_); +} + +ONNXRuntimePredictor::~ONNXRuntimePredictor() { + if (sub_scope_) { + scope_->DeleteScope(sub_scope_); + } + memory::Release(place_); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h new file mode 100644 index 0000000000000000000000000000000000000000..7fb07aa97bd2746773192456ddeba941a24e8906 --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -0,0 +1,225 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/op_compatible_info.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" +#include "paddle/fluid/string/printf.h" + +#include "onnxruntime_c_api.h" // NOLINT +#include "onnxruntime_cxx_api.h" // NOLINT +#include "paddle2onnx/converter.h" + +#ifdef PADDLE_WITH_TESTING +#include +#include +#endif + +/// +/// \file onnxruntime_predictor.h +/// +/// \brief A predictor using ONNXRuntime +/// +/// \author heliqi@baidu.com +/// \date 2022-02-14 +/// \since 2.3.0 +/// + +namespace paddle { + +bool CheckConvertToONNX(const AnalysisConfig &config); + +struct ONNXDesc { + std::string name; + std::vector shape; + ONNXTensorElementDataType dtype; +}; + +/// +/// \class ONNXRuntimePredictor +/// +/// \brief The ONNXRuntimePredictor using ONNXRuntime for inference +/// +/// The predictor has the following typical uses: +/// +/// Get predictor +/// \code{cpp} +/// auto predictor = CreatePaddlePredictor(config); +/// \endcode +/// +/// Get input or output names +/// \code{cpp} +/// auto input_names = predictor->GetInputNames(); +/// auto output_names = predictor->GetOutputNames(); +/// \endcode +/// +/// Get input or output tensors 
+/// \code{cpp} +/// auto input_t = predictor->GetInputTensor(input_names[0]); +/// auto output_t = predictor->GetOutputTensor(output_names[0]); +/// \endcode +/// +/// Run predictor +/// \code{cpp} +/// predictor->ZeroCopyRun(); +/// \endcode +/// +class ONNXRuntimePredictor : public PaddlePredictor { + public: + /// + /// \brief Construct a new ONNXRuntime Predictor object + /// + /// \param[in] AnalysisConfig config + /// + explicit ONNXRuntimePredictor(const AnalysisConfig &config) + : config_(config) { + predictor_id_ = inference::GetUniqueId(); + env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "onnx"); + } + /// + /// \brief Destroy the ONNXRuntime Predictor object + /// + ~ONNXRuntimePredictor(); + + /// + /// \brief Initialize predictor + /// + /// \return Whether the init function executed successfully + /// + bool Init(); + + /// + /// \brief Get the input names + /// + /// \return input names + /// + std::vector GetInputNames(); + + /// + /// \brief Get the output names + /// + /// \return output names + /// + std::vector GetOutputNames(); + + /// + /// \brief Get the Input Tensor object + /// + /// \param[in] name input name + /// \return input tensor + /// + std::unique_ptr GetInputTensor( + const std::string &name) override; + + /// + /// \brief Get the Output Tensor object + /// + /// \param[in] name otuput name + /// \return output tensor + /// + std::unique_ptr GetOutputTensor( + const std::string &name) override; + /// + /// \brief Get all input names and their corresponding shapes + /// + /// \return the map of input names and shapes + /// + std::map> GetInputTensorShape() override; + + /// Not supoort + bool Run(const std::vector &inputs, + std::vector *output_data, + int batch_size = -1) override; + + /// + /// \brief Run the prediction engine + /// + /// \return Whether the function executed successfully + /// + bool ZeroCopyRun() override; + + /// + /// \brief Release all tmp tensor to compress the size of the memory pool. 
+ /// The memory pool is considered to be composed of a list of chunks, if + /// the chunk is not occupied, it can be released. + /// + /// \return Number of bytes released. It may be smaller than the actual + /// released memory, because part of the memory is not managed by the + /// MemoryPool. + /// + uint64_t TryShrinkMemory() override; + /// + /// \brief Clone to get the new predictor. thread safe. + /// + /// \return get a new predictor + /// + std::unique_ptr Clone() override; + + std::shared_ptr scope_; + + private: + /// + /// \brief get the Ort Value(input Tensor). + /// + /// \param[in] desc ONNXDesce(name、shape、dtype) + /// + /// \param[in] device_name "cpu" or "gpu" of device + /// + /// \return get a Ort::Value + /// + Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name); + + /// + /// \brief Ort::Value to Paddle::ZeroCopyTensor. + /// + /// \param[in] value Ort::Value(output Tensor) + /// + /// \param[in] desc a ONNXDesce(name、shape、dtype) + /// + /// \return get a Ort::Value + /// + void AsTensor(const Ort::Value &value, const ONNXDesc &desc); + + private: + AnalysisConfig config_; + + // ONNXRuntime + Ort::Env env_; + Ort::Session session_{nullptr}; + + platform::Place place_; + framework::Scope *sub_scope_{nullptr}; + std::vector input_desc_; + std::vector output_desc_; + int predictor_id_; + +// Some more detailed tests, they are made the friends of the predictor, so that +// the all the details can be tested. +#if PADDLE_WITH_TESTING + FRIEND_TEST(ONNXRuntimePredictor, onnxruntime_on); +#endif +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..2be2de9c60bb1c3fdedf13212d50a6f4e155d4df --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" + +#include +#include +#include +#include // NOLINT +#include +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_api.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/utils/io_utils.h" +#include "paddle/fluid/platform/cpu_info.h" + +DEFINE_string(dirname, "", "dirname to tests."); + +namespace paddle { + +TEST(ONNXRuntimePredictor, onnxruntime_on) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname + "/inference.pdmodel", + FLAGS_dirname + "/inference.pdiparams"); + config.EnableONNXRuntime(); + config.EnableORTOptimization(); + config.SetCpuMathLibraryNumThreads(2); + LOG(INFO) << config.Summary(); + + auto _predictor = + CreatePaddlePredictor(config); + ASSERT_TRUE(_predictor); + auto* predictor = static_cast(_predictor.get()); + + ASSERT_TRUE(predictor); + ASSERT_TRUE(!predictor->Clone()); + ASSERT_TRUE(predictor->scope_); + ASSERT_TRUE(predictor->sub_scope_); + ASSERT_EQ(predictor->scope_->parent(), nullptr); + ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get()); + // Dummy Input Data + std::vector input_shape = {-1, 3, 224, 224}; + std::vector input_data(1 * 3 * 224 * 224, 1.0); + 
std::vector out_data; + out_data.resize(1000); + + // testing all interfaces + auto input_names = predictor->GetInputNames(); + auto output_names = predictor->GetOutputNames(); + auto get_input_shape = predictor->GetInputTensorShape(); + + ASSERT_EQ(input_names.size(), 1UL); + ASSERT_EQ(output_names.size(), 1UL); + ASSERT_EQ(input_names[0], "inputs"); + ASSERT_EQ(output_names[0], "save_infer_model/scale_0.tmp_1"); + ASSERT_EQ(get_input_shape["inputs"], input_shape); + + auto input_tensor = predictor->GetInputTensor(input_names[0]); + input_tensor->Reshape({1, 3, 224, 224}); + auto output_tensor = predictor->GetOutputTensor(output_names[0]); + + input_tensor->CopyFromCpu(input_data.data()); + ASSERT_TRUE(predictor->ZeroCopyRun()); + output_tensor->CopyToCpu(out_data.data()); + + predictor->TryShrinkMemory(); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index b4a358394404fa7d28838a00c96290747f146a1f..7b765e3fa8a24ef1b81b68da8ba12dd8e5577572 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -319,6 +319,18 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableNpu(int device_id = 0); /// + /// \brief Turn on ONNXRuntime. + /// + void EnableONNXRuntime(); + /// + /// \brief Turn off ONNXRuntime. + /// + void DisableONNXRuntime(); + /// + /// \brief Turn on ONNXRuntime Optimization. + /// + void EnableORTOptimization(); + /// /// \brief A boolean state telling whether the GPU is turned on. /// /// \return bool Whether the GPU is turned on. @@ -342,6 +354,19 @@ struct PD_INFER_DECL AnalysisConfig { /// bool use_ipu() const { return use_ipu_; } /// + /// \brief A boolean state telling whether the ONNXRuntime is turned on. + /// + /// \return bool Whether the ONNXRuntime is turned on. 
+ /// + bool use_onnxruntime() const { return use_onnxruntime_; } + /// + /// \brief A boolean state telling whether the ONNXRuntime Optimization is + /// turned on. + /// + /// \return bool Whether the ONNXRuntime Optimization is turned on. + /// + bool ort_optimization_enabled() const { return enable_ort_optimization_; } + /// /// \brief Get the GPU device id. /// /// \return int The GPU device id. @@ -841,6 +866,10 @@ struct PD_INFER_DECL AnalysisConfig { bool use_npu_{false}; int npu_device_id_{0}; + // ONNXRuntime related + bool use_onnxruntime_{false}; + bool enable_ort_optimization_{false}; + // Padding related bool use_fc_padding_{true}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index c129efe494b4fb36bc72d3c93e24951ba7fef322..657dd9b600cce7173e3aa8d0156ba0975199cf98 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -192,6 +192,7 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor { private: friend class AnalysisPredictor; + friend class ONNXRuntimePredictor; explicit ZeroCopyTensor(void* scope) : paddle_infer::Tensor{scope} {} }; @@ -381,6 +382,7 @@ enum class PaddleEngineKind { kNative = 0, ///< Use the native Fluid facility. kAutoMixedTensorRT, ///< Automatically mix Fluid with TensorRT. kAnalysis, ///< More optimization. 
+ kONNXRuntime, ///< Use ONNXRuntime }; template @@ -395,6 +397,11 @@ template <> PD_INFER_DECL std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config); +template <> +PD_INFER_DECL std::unique_ptr +CreatePaddlePredictor( + const AnalysisConfig& config); + PD_INFER_DECL int PaddleDtypeSize(PaddleDType dtype); PD_INFER_DECL std::string get_version(); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index f5f36d805b43ea0815683e3b65bf157fe5beb2de..22d9dedb32ebfcc229e0034cc5cf6092907dc8df 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -262,6 +262,7 @@ void CpuPassStrategy::EnableMKLDNN() { // "fc_act_mkldnn_fuse_pass", "batch_norm_act_fuse_pass", // "softplus_activation_mkldnn_fuse_pass", // + "elt_act_mkldnn_fuse_pass", // // TODO(intel): Please fix the bug on windows. // https://github.com/PaddlePaddle/Paddle/issues/29710 // "mkldnn_inplace_pass", // This pass should be activated after diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc index e342190fda1aca53a6814806e1afec1335224d79..d7b07652babbd1e24e2c650ac8ac079f03523d12 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.cc +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -126,6 +126,26 @@ PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) { return config->use_gpu(); } +void PD_ConfigEnableONNXRuntime(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableONNXRuntime(); +} + +void PD_ConfigDisableONNXRuntime(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableONNXRuntime(); +} + +PD_Bool PD_ConfigONNXRuntimeEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_onnxruntime(); +} + +void PD_ConfigEnableORTOptimization(__pd_keep PD_Config* pd_config) { + 
CHECK_AND_CONVERT_PD_CONFIG; + config->EnableORTOptimization(); +} + void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, int32_t l3_workspace_size, PD_Bool locked, PD_Bool autotune, const char* autotune_file, diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index c314aca918f141d30661d9034656899bbb816063..f6b754cad213f8d5249317468b5ceb21e863f6ad 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -152,6 +152,34 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu( PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu( __pd_keep PD_Config* pd_config); /// +/// \brief Turn on ONNXRuntime. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableONNXRuntime( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn off ONNXRuntime. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableONNXRuntime( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the ONNXRutnime is turned on. +/// +/// \return Whether the ONNXRuntime is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigONNXRuntimeEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on ONNXRuntime Optimization. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableORTOptimization( + __pd_keep PD_Config* pd_config); +/// /// \brief Turn on XPU. 
/// /// \param[in] pd_onfig config diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go index def26913b0a1c082b3a983cea5fa8021c468b59c..8f9f34c06b4768317d6f710ac49a7610a9ef9d6a 100644 --- a/paddle/fluid/inference/goapi/config.go +++ b/paddle/fluid/inference/goapi/config.go @@ -160,6 +160,36 @@ func (config *Config) EnableUseGpu(memorySize uint64, deviceId int32) { C.PD_ConfigEnableUseGpu(config.c, C.uint64_t(memorySize), C.int32_t(deviceId)) } +/// +/// \brief Turn on ONNXRuntime. +/// +func (config *Config) EnableONNXRuntime() { + C.PD_ConfigEnableONNXRuntime(config.c) +} + +/// +/// \brief Turn off ONNXRuntime. +/// +func (config *Config) DisableONNXRuntime() { + C.PD_ConfigDisableONNXRuntime(config.c) +} + +/// +/// \brief A boolean state telling whether the ONNXRuntime is turned on. +/// +/// \return bool Whether the ONNXRuntime is turned on. +/// +func (config *Config) ONNXRuntimeEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigONNXRuntimeEnabled(config.c)) +} + +/// +/// \brief Turn on ONNXRuntime Optimization. +/// +func (config *Config) EnableORTOptimization() { + C.PD_ConfigEnableORTOptimization(config.c) +} + /// /// \brief Turn on XPU. 
/// diff --git a/paddle/fluid/inference/goapi/config_test.go b/paddle/fluid/inference/goapi/config_test.go index b82161880839e500a20b787914e2827da151106b..297841dcbcf6c19aef4a536557ec30e76ea9f82c 100644 --- a/paddle/fluid/inference/goapi/config_test.go +++ b/paddle/fluid/inference/goapi/config_test.go @@ -122,3 +122,20 @@ func TestMkldnn(t *testing.T) { config.SetBfloat16Op([]string{"fc", "mul"}) } + +func TestONNXRuntime(t *testing.T) { + config := NewConfig() + config.SetModelDir("modelDir") + t.Log(config.ModelDir()) + + config.EnableONNXRuntime() + t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled()) + + config.DisableONNXRuntime() + t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled()) + + config.EnableORTOptimization() + + config.SetCpuMathLibraryNumThreads(4) + t.Logf("CpuMathLibraryNumThreads:%+v", config.CpuMathLibraryNumThreads()) +} \ No newline at end of file diff --git a/paddle/fluid/inference/goapi/predictor_test.go b/paddle/fluid/inference/goapi/predictor_test.go index 40e518304510c57fec9cd7609ecbd6eefa456050..755558f96238d11842f8245c2b36210c60d8a057 100644 --- a/paddle/fluid/inference/goapi/predictor_test.go +++ b/paddle/fluid/inference/goapi/predictor_test.go @@ -66,6 +66,42 @@ func TestNewPredictor(t *testing.T) { cloned.ClearIntermediateTensor() } +func TestONNXRuntimePredictor(t *testing.T) { + t.Logf("Version:\n%+v", Version()) + config := NewConfig() + config.SetModel("./mobilenetv1/inference.pdmodel", "./mobilenetv1/inference.pdiparams") + config.EnableONNXRuntime() + config.EnableORTOptimization() + predictor := NewPredictor(config) + inNames := predictor.GetInputNames() + t.Logf("InputNames:%+v", inNames) + outNames := predictor.GetOutputNames() + t.Logf("OutputNames:%+v", outNames) + + inHandle := predictor.GetInputHandle(inNames[0]) + inHandle.Reshape([]int32{1, 3, 224, 224}) + t.Logf("inHandle name:%+v, shape:%+v", inHandle.Name(), inHandle.Shape()) + + data := make([]float32, numElements([]int32{1, 3, 224, 224})) + for 
i := 0; i < int(numElements([]int32{1, 3, 224, 224})); i++ { + data[i] = float32(i%255) * 0.1 + } + inHandle.CopyFromCpu(data) + t.Logf("inHandle Type:%+v", inHandle.Type()) + + predictor.Run() + + outHandle := predictor.GetOutputHandle(outNames[0]) + t.Logf("outHandle name:%+v", outHandle.Name()) + + outShape := outHandle.Shape() + t.Logf("outHandle Shape:%+v", outShape) + outData := make([]float32, numElements(outShape)) + outHandle.CopyToCpu(outData) + t.Log(outData) +} + + func TestFromBuffer(t *testing.T) { modelFile, err := os.Open("./mobilenetv1/inference.pdmodel") if err != nil { diff --git a/paddle/fluid/inference/goapi/test.sh b/paddle/fluid/inference/goapi/test.sh index edccc2648c012fda9e22c2fc14ffe4f90dc26cfe..cff9fd4aa7ceada2a37d9650c9ce3653f0155447 100644 --- a/paddle/fluid/inference/goapi/test.sh +++ b/paddle/fluid/inference/goapi/test.sh @@ -22,6 +22,7 @@ fi # 2. set LD_LIBRARY_PATH export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/ +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/onnxruntime/lib/:$PWD/paddle_inference_c/third_party/install/paddle2onnx/lib/ # 3. 
go test go clean -testcache diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index 8c61200f7f57cdf57b372c37c8f7cea40c4a8d4c..b69292827aa136fd1d8a1f66d80823e6344a6174 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -89,5 +89,5 @@ class DropoutOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index a432ff62810aa30c01c1980c80bf3f344039f7dd..f19b21d3e632633d7066c3e9e14cadd2900eb339 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -335,15 +335,37 @@ class MultiheadMatMulOpConverter : public OpConverter { reshape_before_fc_dim.d[4] = 1; auto* reshape_before_fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + if (enable_int8) { + engine_->SetTensorDynamicRange(reshape_before_fc_layer->getOutput(0), + in_scale); + } reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); reshape_before_fc_layer->setName( ("shuffle_before_multihead_mamul(Output: " + output_name + ")") .c_str()); // add layer fc - auto* fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n, - weight.get(), bias.get()); + nvinfer1::ILayer* fc_layer = nullptr; + if (enable_int8) { + nvinfer1::DimsHW nv_ksize(1, 1); + fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, Convolution, *reshape_before_fc_layer->getOutput(0), n, + nv_ksize, weight.get(), bias.get()); + } else { + fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), + n, weight.get(), bias.get()); + } + + if (enable_int8) { + 
PADDLE_ENFORCE_EQ( + op_desc.HasAttr("fc_out_threshold"), true, + platform::errors::InvalidArgument( + "must have out threshold in multihead layers in int8 mode")); + float out_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("fc_out_threshold")); + engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); + } fc_layer->setName( ("multihead_mamul_fc(Output: " + output_name + ")").c_str()); @@ -359,6 +381,10 @@ class MultiheadMatMulOpConverter : public OpConverter { plugin_inputs.push_back(input_bias_qk); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + + if (enable_int8) { + with_fp16 = 1; + } plugin::DynamicPluginTensorRT* plugin = new plugin::QkvToContextPluginDynamic(hidden_in, head_number, head_size, scale, with_fp16); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index fe04d552e40263a396059e3da59de4d51def67e0..7b65d2d7c97cca335f76f1d0399a25bcd8a00c92 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -328,5 +328,5 @@ class Pool2dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc index b8e87a8d94d1f43d35da1a46c300a1b37c9382ec..5a306f622adbe7a298ab53daae1168ad50b402a9 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc @@ -224,5 +224,5 @@ class Pool3dOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(pool3d); +USE_OP_ITSELF(pool3d); REGISTER_TRT_OP_CONVERTER(pool3d, Pool3dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc 
index f2dc5ba1c7c2c832e0239f6a30760c354aaf4699..1946f9e28388e3ab6d1d580d0f7d91c1ef3e604f 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -52,7 +52,7 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace inference } // namespace paddle -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(sigmoid); -USE_OP(tanh); +USE_OP_ITSELF(tanh); USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc index 95916746d6fcb528d26a8f8bb39980b55c4f3704..b96992ef8514abe0f71dbf23d38abb626f6c4a5b 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" -USE_OP(conv2d); +USE_OP_ITSELF(conv2d); USE_OP(conv2d_transpose); namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc index 474fd92071fb0795b868f0cd86591061cf8b6581..cf377396087637f115523ddc60a468e2a23d57d4 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc @@ -57,4 +57,4 @@ TEST(DropoutOpConverter, main) { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc index 1725888abc379bfa4ffbbc5cfc4cecd1872c7c18..f17e00de0eeb7c8f4d782f0a4eaecc2fc1df268b 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc @@ -45,4 +45,4 @@ TEST(leaky_relu_op, 
test_leaky_relu) { } // namespace paddle // USE_OP(leaky_relu); -USE_OP(leaky_relu); +USE_OP_ITSELF(leaky_relu); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc index bded833505cd25352adc4123de415613d1fc926d..36f13262a73d703a6d9776855adbab3c44075aa7 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -71,4 +71,4 @@ TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } } // namespace inference } // namespace paddle -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index b2764ca61c11219e5546867813157b7f05ee3ce8..d53a8923af6120adb460d95fc81820b6dfa03a60 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -54,6 +54,8 @@ TRT_DT FluidDataType2TRT(FluidDT type) { return TRT_DT::kFLOAT; case FluidDT::VarType_Type_INT32: return TRT_DT::kINT32; + case FluidDT::VarType_Type_FP16: + return TRT_DT::kHALF; default: return TRT_DT::kINT32; } diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu index 861a9aa9d000bff9e6dcc673cc5c8d99c3a7a6ec..5596a89a083fe9ff177aa9abc769b8fa27105c1f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -108,16 +108,14 @@ int Pool3DPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool3d_type_ == Pool3DType::max) { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); } else if (pool3d_type_ == Pool3DType::avg) { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, adaptive_, odatas[0], stream, pool_process); @@ -351,16 +349,14 @@ int Pool3DPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool3d_type_ == "max") { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); } else if (pool3d_type_ == "avg") { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool3dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool3dDirectCUDAFunctor, float> pool3d_forward; pool3d_forward(input, input_shape, 
output_shape, ksize, strides_, paddings, true, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 6d711c26adc6ff8e49375d15f32322303f3ae6ef..9bfe98d759d8e29bc34b42fa667e5cda5f1493de 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace inference { @@ -84,16 +84,14 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs, output_shape.insert(output_shape.begin(), batchSize); if (pool_type_ == PoolType::max) { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, true, false, odatas[0], stream, pool_process); } else if (pool_type_ == PoolType::avg) { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, exclusive_, adaptive_, odatas[0], stream, @@ -292,16 +290,14 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, } if (pool_type_ == "max") { - paddle::operators::math::MaxPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::MaxPool, float> + phi::funcs::MaxPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> 
pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, true, false, output, stream, pool_process); } else if (pool_type_ == "avg") { - paddle::operators::math::AvgPool pool_process; - paddle::operators::math::Pool2dDirectCUDAFunctor< - paddle::operators::math::AvgPool, float> + phi::funcs::AvgPool pool_process; + phi::funcs::Pool2dDirectCUDAFunctor, float> pool2d_forward; pool2d_forward(input, input_shape, output_shape, ksize, strides_, paddings, exclusive_, adaptive_, output, stream, pool_process); diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index 57177cfa8b421e1d79004bb1a7f738d98dc00f97..336005d883b0f523213060645e540c35a14e4e9c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -16,7 +16,6 @@ #include #include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" -#include "paddle/fluid/operators/detection/yolo_box_op.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc index df0eb58c2bd587e69215602512cc51f19c97a978..a341ffd7a081c24500e3b061b0ce3510a2aaacbc 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -81,6 +81,18 @@ TEST(PD_Config, interface) { PD_ConfigSetBfloat16Op(config, 1, &ops_name); #endif + PD_ConfigEnableONNXRuntime(config); + bool onnxruntime_enabled = PD_ConfigONNXRuntimeEnabled(config); +#ifdef PADDLE_WITH_ONNXRUNTIME + EXPECT_TRUE(onnxruntime_enabled); +#else + EXPECT_FALSE(onnxruntime_enabled); +#endif + PD_ConfigDisableONNXRuntime(config); + bool onnxruntime_disabled = PD_ConfigONNXRuntimeEnabled(config); + EXPECT_FALSE(onnxruntime_disabled); + 
PD_ConfigEnableORTOptimization(config); + PD_ConfigEnableMemoryOptim(config, true); bool memory_enabled = PD_ConfigMemoryOptimEnabled(config); EXPECT_TRUE(memory_enabled); diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index 9d83f8ff8fdc4756450c0fe9ae4d7096d9afa76f..f376cbd4fb302b1d7a038d958465f24db653e220 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -5,6 +5,7 @@ option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF) option(USE_TENSORRT "Compile demo with TensorRT." OFF) option(WITH_GTEST "Compile demo with GTEST" OFF) +option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) if(NOT WITH_STATIC_LIB) add_definitions("-DPADDLE_WITH_SHARED_LIB") @@ -45,6 +46,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (WITH_ONNXRUNTIME) + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") + + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") +endif() if (WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") @@ -172,6 +180,16 @@ else() endif() endif() +if (WITH_ONNXRUNTIME) + if(WIN32) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx) + elseif(APPLE) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx) + else() + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx) + endif() +endif() + if (NOT WIN32) set(EXTERNAL_LIB "-lrt 
-ldl -lpthread") set(DEPS ${DEPS} @@ -248,6 +266,14 @@ if(WIN32) COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release ) endif() + if(WITH_ONNXRUNTIME) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + endif() if(NOT WITH_STATIC_LIB) add_custom_command(TARGET ${DEMO_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh index dd4b64f28d739776ee750205d41b4dce35a97640..8123d3785003471fd5f63f24fbb1166913d7e571 100755 --- a/paddle/fluid/inference/tests/infer_ut/run.sh +++ b/paddle/fluid/inference/tests/infer_ut/run.sh @@ -20,7 +20,8 @@ TURN_ON_MKL=$2 # use MKL or Openblas TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset TENSORRT_ROOT_DIR=$5 # TensorRT ROOT dir, default to /usr/local/TensorRT -MSVC_STATIC_CRT=$6 +WITH_ONNXRUNTIME=$6 +MSVC_STATIC_CRT=$7 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir EXIT_CODE=0 # init default exit code WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform @@ -144,7 +145,8 @@ function compile_test() { -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ -DWITH_GTEST=ON \ -DCMAKE_CXX_FLAGS='/std:c++17' \ - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release ALL_BUILD.vcxproj else cmake .. 
-DPADDLE_LIB=${inference_install_dir} \ @@ -154,7 +156,8 @@ function compile_test() { -DWITH_STATIC_LIB=OFF \ -DUSE_TENSORRT=$USE_TENSORRT \ -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ - -DWITH_GTEST=ON + -DWITH_GTEST=ON \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) fi; cd - diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 05c468b798886ac135ed30bff75ce9400f1ca3a1..6b6c0cd22f03b902f08d7a79236b1091b9fe6677 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -80,6 +80,14 @@ if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inferenc endif() set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model") +if(WITH_ONNXRUNTIME) + set(MOBILENETV2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/MobileNetV2") + if(NOT EXISTS ${MOBILENETV2_INSTALL_DIR}/MobileNetV2.inference.model.tar.gz) + inference_download_and_uncompress_without_verify(${MOBILENETV2_INSTALL_DIR} ${INFERENCE_URL} "MobileNetV2.inference.model.tar.gz") + endif() + set(MOBILENETV2_MODEL_DIR "${MOBILENETV2_INSTALL_DIR}/MobileNetV2") +endif() + function (inference_base_test_build TARGET) set(options "") set(oneValueArgs "") diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 6cd7d87332323f4bafd49b8b16254f9610405658..f296ce96d4e5f6dca5c4ad2668eea8508b37068f 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -17,7 +17,7 @@ if (WITH_GPU) nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) nv_library(cuda_managed_allocator SRCS cuda_managed_allocator.cc DEPS allocator cuda_device_guard gpu_info) nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) - nv_library(stream_safe_cuda_allocator SRCS stream_safe_cuda_allocator.cc DEPS allocator) + nv_library(stream_safe_cuda_allocator SRCS 
stream_safe_cuda_allocator.cc DEPS allocator cuda_graph) nv_library(thread_local_allocator SRCS thread_local_allocator.cc DEPS allocator) cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) @@ -131,4 +131,7 @@ cc_library(virtual_memory_auto_growth_best_fit_allocator SRCS virtual_memory_aut if(NOT WIN32) cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) + if (WITH_GPU) + cc_library(cuda_ipc_allocator SRCS cuda_ipc_allocator.cc DEPS allocator) + endif() endif(NOT WIN32) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 4d0e485285146e5668793d29fd8effc789fcc339..61e292a922f0e98a958d4fe2f8fc7850bdf47e18 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -193,10 +193,10 @@ class AllocatorFacadePrivate { } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { for (size_t dev_id = 0; - dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + dev_id < phi::DeviceManager::GetDeviceCount(dev_type); ++dev_id) { InitNaiveBestFitCustomDeviceAllocator( platform::CustomPlace(dev_type, dev_id)); @@ -210,12 +210,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitCPUAllocator(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allow_free_idle_chunk_ = allow_free_idle_chunk; - if (FLAGS_use_stream_safe_cuda_allocator) { - for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); - ++dev_id) { - InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr); - } - } else { + if (!FLAGS_use_stream_safe_cuda_allocator) { for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { 
InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), @@ -240,10 +235,10 @@ class AllocatorFacadePrivate { } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { for (size_t dev_id = 0; - dev_id < platform::DeviceManager::GetDeviceCount(dev_type); + dev_id < phi::DeviceManager::GetDeviceCount(dev_type); ++dev_id) { InitAutoGrowthCustomDeviceAllocator( platform::CustomPlace(dev_type, dev_id), allow_free_idle_chunk); @@ -298,6 +293,12 @@ class AllocatorFacadePrivate { } CheckAllocThreadSafe(); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + WrapCUDAGraphAllocator(); + } +#endif } inline const std::shared_ptr& GetAllocator( @@ -388,39 +389,6 @@ class AllocatorFacadePrivate { allocation.get())); return stream_safe_cuda_allocation->GetOwningStream(); } - -#ifdef PADDLE_WITH_CUDA - void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, - platform::errors::InvalidArgument( - "CUDA Graph is only supported when the " - "FLAGS_allocator_strategy=\"auto_growth\", but got " - "FLAGS_allocator_strategy=\"%s\"", - FLAGS_allocator_strategy)); - auto& allocator = cuda_graph_allocator_map_[id]; - PADDLE_ENFORCE_EQ( - allocator.get(), nullptr, - platform::errors::InvalidArgument( - "The memory pool of the CUDA Graph with ID %d have been prepared.", - id)); - allocator.reset( - new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); - for (auto& item : allocator->allocators_) { - auto& old_allocator = item.second; - old_allocator = CUDAGraphAllocator::Create(old_allocator); - } - VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; - } - - void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE(iter, 
cuda_graph_allocator_map_.end(), - platform::errors::InvalidArgument( - "Cannot find CUDA Graph with ID = %d", id)); - cuda_graph_allocator_map_.erase(iter); - VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; - } -#endif #endif private: @@ -439,24 +407,7 @@ class AllocatorFacadePrivate { platform::Place place_; }; - const AllocatorMap& GetAllocatorMap() { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { - auto id = platform::CUDAGraph::CapturingID(); - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE( - iter, cuda_graph_allocator_map_.end(), - platform::errors::PermissionDenied( - "No memory pool is prepared for CUDA Graph capturing.")); - VLOG(10) << "Choose CUDA Graph memory pool to allocate memory"; - return iter->second->allocators_; - } else { - return allocators_; - } -#else - return allocators_; -#endif - } + const AllocatorMap& GetAllocatorMap() { return allocators_; } void InitNaiveBestFitCPUAllocator() { allocators_[platform::CPUPlace()] = @@ -672,10 +623,10 @@ class AllocatorFacadePrivate { } void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { - const std::shared_ptr& underlying_allocator = - cuda_allocators_[p][stream]; - cuda_allocators_[p][stream] = std::make_shared( - underlying_allocator, p, stream); + std::shared_ptr& allocator = cuda_allocators_[p][stream]; + allocator = std::make_shared( + allocator, p, stream, + /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_); } void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, @@ -684,10 +635,19 @@ class AllocatorFacadePrivate { retry_time, 0, platform::errors::InvalidArgument( "Retry time should be larger than 0, but got %d", retry_time)); - std::shared_ptr allocator = cuda_allocators_[p][stream]; + std::shared_ptr& allocator = cuda_allocators_[p][stream]; allocator = std::make_shared(allocator, retry_time); } +#ifdef PADDLE_WITH_CUDA + void WrapCUDAGraphAllocator() { + for (auto& 
item : allocators_) { + auto& allocator = item.second; + allocator = CUDAGraphAllocator::Create(allocator); + } + } +#endif + static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) { for (auto& place_pair : allocators) { for (auto& stream_pair : place_pair.second) { @@ -738,7 +698,7 @@ class AllocatorFacadePrivate { auto custom_allocator = std::make_shared(p); allocators_[p] = std::make_shared( - custom_allocator, platform::DeviceManager::GetMinChunkSize(p), + custom_allocator, phi::DeviceManager::GetMinChunkSize(p), allow_free_idle_chunk); } #endif @@ -814,11 +774,10 @@ class AllocatorFacadePrivate { } #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE - auto device_types = platform::DeviceManager::GetAllCustomDeviceTypes(); + auto device_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : device_types) { for (size_t dev_id = 0; - dev_id < platform::DeviceManager::GetDeviceCount(dev_type); - dev_id++) { + dev_id < phi::DeviceManager::GetDeviceCount(dev_type); dev_id++) { places.emplace_back(platform::CustomPlace(dev_type, dev_id)); } } @@ -865,10 +824,6 @@ class AllocatorFacadePrivate { // a standalone CUDA allocator to support multi-stream GC in new executor CUDAAllocatorMap cuda_allocators_; std::shared_timed_mutex cuda_allocator_mutex_; -#ifdef PADDLE_WITH_CUDA - std::unordered_map> - cuda_graph_allocator_map_; -#endif #endif AllocatorStrategy strategy_; AllocatorMap allocators_; @@ -887,8 +842,24 @@ AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} AllocatorFacade::~AllocatorFacade() {} AllocatorFacade& AllocatorFacade::Instance() { - static AllocatorFacade instance; - return instance; + static AllocatorFacade* instance = new AllocatorFacade; + return *instance; +} + +AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + auto id = platform::CUDAGraph::CapturingID(); + auto iter = 
cuda_graph_map_.find(id); + PADDLE_ENFORCE_NE( + iter, cuda_graph_map_.end(), + platform::errors::PermissionDenied( + "No memory pool is prepared for CUDA Graph capturing.")); + VLOG(10) << "Choose CUDA Graph memory pool"; + return iter->second.get(); + } +#endif + return m_; } const std::shared_ptr& AllocatorFacade::GetAllocator( @@ -896,19 +867,14 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, - /* A non-zero num to choose allocator_ */ 1); - } -#endif - + AllocatorFacadePrivate* m = GetPrivate(); platform::CUDAPlace cuda_place(place.GetDeviceId()); - return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place)); + return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place)); } #endif - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + return GetPrivate()->GetAllocator( + place, /* A non-zero num to choose allocator_ */ 1); } void* AllocatorFacade::GetBasePtr( @@ -923,7 +889,7 @@ void* AllocatorFacade::GetBasePtr( "GetBasePtr() is only implemented for CUDAPlace(), not " "suppot place: %s", allocation->place())); - return m_->GetBasePtr(allocation); + return GetPrivate()->GetBasePtr(allocation); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -931,21 +897,17 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( const platform::Place& place, const gpuStream_t& stream) { if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, - /* A non-zero num to choose allocator_ */ 1); - } -#endif - return m_->GetAllocator(place, stream, 
/*create_if_not_found=*/true); + return GetPrivate()->GetAllocator(place, stream, + /*create_if_not_found=*/true); } - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); + return GetPrivate()->GetAllocator( + place, /* A non-zero num to choose allocator_ */ 1); } #endif const std::shared_ptr& AllocatorFacade::GetZeroAllocator( const platform::Place& place) { - return m_->GetAllocator(place, /* zero size */ 0); + return GetPrivate()->GetAllocator(place, /* zero size */ 0); } std::shared_ptr AllocatorFacade::AllocShared( @@ -958,43 +920,30 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && size > 0 && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_->GetAllocator(place, size)->Allocate(size); - } -#endif - platform::CUDAPlace cuda_place(place.GetDeviceId()); - return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place)); + phi::Stream default_stream = phi::Stream(reinterpret_cast( + GetPrivate()->GetDefaultStream(cuda_place))); + return Alloc(cuda_place, size, default_stream); } #endif - - return m_->GetAllocator(place, size)->Allocate(size); + return GetPrivate()->GetAllocator(place, size)->Allocate(size); } uint64_t AllocatorFacade::Release(const platform::Place& place) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && FLAGS_use_system_allocator == false) { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - return m_ - ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) - ->Release(place); - } -#endif - platform::CUDAPlace cuda_place(place.GetDeviceId()); - return Release(cuda_place, m_->GetDefaultStream(cuda_place)); + return Release(cuda_place, 
GetPrivate()->GetDefaultStream(cuda_place)); } #endif - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + return GetPrivate() + ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) ->Release(place); } std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( @@ -1002,71 +951,53 @@ std::shared_ptr AllocatorFacade::AllocShared( "multi-stream 'AllocaShared' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - gpuStream_t s = reinterpret_cast(stream.id()); - return std::shared_ptr(Alloc(place, size, s)); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); -#endif + return std::shared_ptr(Alloc(place, size, stream)); } -bool AllocatorFacade::InSameStream( - const std::shared_ptr& allocation, - const phi::Stream& stream) { +AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, + const phi::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'InSameStream' function. To enable it, you can enter" + "multi-stream 'Alloc' function. 
To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + platform::CUDAPlace p(place.GetDeviceId()); + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { + gpuStream_t s = reinterpret_cast(stream.id()); + return GetPrivate() + ->GetAllocator(p, s, /* create_if_not_found = */ true) + ->Allocate(size); + } else { + return GetPrivate()->GetAllocator(p, size)->Allocate(size); } -#endif - gpuStream_t s = reinterpret_cast(stream.id()); - return s == GetStream(allocation); #else PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif } +bool AllocatorFacade::InSameStream( + const std::shared_ptr& allocation, + const phi::Stream& stream) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, - const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( FLAGS_use_stream_safe_cuda_allocator, true, platform::errors::Unimplemented( "StreamSafeCUDAAllocator is disabled, you should not call this " - "multi-stream 'Alloc' function. To enable it, you can enter" + "multi-stream 'InSameStream' function. 
To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } + gpuStream_t s = reinterpret_cast(stream.id()); + return s == GetStream(allocation); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); #endif - platform::CUDAPlace p(place.GetDeviceId()); - if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { - return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) - ->Allocate(size); - } else { - return m_->GetAllocator(p, size)->Allocate(size); - } } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { PADDLE_ENFORCE_EQ( @@ -1076,15 +1007,7 @@ uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, "multi-stream 'Release' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - return m_->GetAllocator(place, stream)->Release(place); + return GetPrivate()->GetAllocator(place, stream)->Release(place); } void AllocatorFacade::RecordStream(std::shared_ptr allocation, @@ -1096,15 +1019,7 @@ void AllocatorFacade::RecordStream(std::shared_ptr allocation, "'RecordStream' function. 
To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - m_->RecordStream(allocation, stream); + GetPrivate()->RecordStream(allocation, stream); } const gpuStream_t& AllocatorFacade::GetStream( @@ -1116,24 +1031,34 @@ const gpuStream_t& AllocatorFacade::GetStream( "'GetStream' function. To enable it, you can enter" "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " "terminal.")); - -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - PADDLE_THROW(platform::errors::Unavailable( - "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); - } -#endif - - return m_->GetStream(allocation); + return GetPrivate()->GetStream(allocation); } #ifdef PADDLE_WITH_CUDA void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - return m_->PrepareMemoryPoolForCUDAGraph(id); + PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when the " + "FLAGS_allocator_strategy=\"auto_growth\", but got " + "FLAGS_allocator_strategy=\"%s\"", + FLAGS_allocator_strategy)); + auto& allocator = cuda_graph_map_[id]; + PADDLE_ENFORCE_EQ( + allocator.get(), nullptr, + platform::errors::InvalidArgument( + "The memory pool of the CUDA Graph with ID %d have been prepared.", + id)); + allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; } void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - return m_->RemoveMemoryPoolOfCUDAGraph(id); + auto iter = cuda_graph_map_.find(id); + PADDLE_ENFORCE_NE(iter, cuda_graph_map_.end(), + platform::errors::InvalidArgument( + "Cannot find CUDA Graph with 
ID = %d", id)); + cuda_graph_map_.erase(iter); + VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; } #endif #endif diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 1722a06b01f1302c3bb1f98c99af0431ab62f955..9066bb284e28af197111b5d3ea129cc65b5fe914 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -49,6 +49,8 @@ class AllocatorFacade { static AllocatorFacade& Instance(); + AllocatorFacadePrivate* GetPrivate() const; + const std::shared_ptr& GetAllocator(const platform::Place& place); void* GetBasePtr(const std::shared_ptr& allocation); @@ -73,13 +75,14 @@ class AllocatorFacade { size_t size, const phi::Stream& stream); + AllocationPtr Alloc(const platform::Place& place, size_t size, + const phi::Stream& stream); + bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. - AllocationPtr Alloc(const platform::Place& place, size_t size, - const gpuStream_t& stream); uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); void RecordStream(std::shared_ptr allocation, const gpuStream_t& stream); @@ -96,6 +99,10 @@ class AllocatorFacade { private: AllocatorFacade(); AllocatorFacadePrivate* m_; +#ifdef PADDLE_WITH_CUDA + std::unordered_map> + cuda_graph_map_; +#endif }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..b2f24d5aed1eb827b4857f5936a19b206a38c788 --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _WIN32 + +#include "paddle/fluid/memory/allocation/cuda_ipc_allocator.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +#include +#include +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { + +namespace { +std::mutex ipc_mutex_; +std::unordered_map> ipc_handle_to_baseptr_; +} // namespace + +std::shared_ptr GetIpcBasePtr(std::string handle) { + std::lock_guard lock(ipc_mutex_); + + auto iter = ipc_handle_to_baseptr_.find(handle); + if (iter != ipc_handle_to_baseptr_.end()) { + auto baseptr = iter->second.lock(); + if (baseptr) return baseptr; + } + // The IpcMemHandle can only open once for the same handle, + // so here we cache it here. + void *baseptr = nullptr; + auto ipc_handle = + reinterpret_cast(handle.c_str()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcOpenMemHandle( + &baseptr, *ipc_handle, cudaIpcMemLazyEnablePeerAccess)); + // Close ipc handle on the same device. + int device_id = platform::GetCurrentDeviceId(); + // Add deleter to close ipc handle. 
+ auto sp = std::shared_ptr(baseptr, [handle, device_id](void *ptr) { + platform::CUDADeviceGuard guard(device_id); + std::lock_guard lock(ipc_mutex_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaIpcCloseMemHandle(ptr)); + ipc_handle_to_baseptr_.erase(handle); + VLOG(6) << "cudaIpcCloseMemHandle for ptr:" + << "\t" << ptr; + }); + std::weak_ptr wp = sp; + ipc_handle_to_baseptr_.insert(iter, {handle, wp}); + + return sp; +} + +CudaIpcAllocation::~CudaIpcAllocation() { + shared_ptr_.reset(); + VLOG(6) << "tensor deleted cudaIpcCloseMemHandle for ptr:" + << "\t" << this->ptr(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/allocation/cuda_ipc_allocator.h b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..52e3cf10ea73a787d87d19beeedcdedca1e3dd3b --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_ipc_allocator.h @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef _WIN32 +#pragma once + +#include +#include // NOLINT +#include +#include +#include + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { + +std::shared_ptr GetIpcBasePtr(std::string handle); + +class CudaIpcAllocation : public Allocation { + public: + explicit CudaIpcAllocation(void *ptr, size_t size, int device_id, + std::shared_ptr shared_ptr) + : Allocation(ptr, size, platform::CUDAPlace(device_id)), + device_id_(std::move(device_id)), + shared_ptr_(std::move(shared_ptr)) {} + + inline const int &device_id() const { return device_id_; } + + ~CudaIpcAllocation() override; + + private: + int device_id_; + std::shared_ptr shared_ptr_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/allocation/custom_allocator.cc b/paddle/fluid/memory/allocation/custom_allocator.cc index bd52c8f4ad270f0f70a23ab39b78bd9363ede769..e53d7b1cc766a3e277ef0a773671ef678bcb3ac7 100644 --- a/paddle/fluid/memory/allocation/custom_allocator.cc +++ b/paddle/fluid/memory/allocation/custom_allocator.cc @@ -32,17 +32,16 @@ void CustomAllocator::FreeImpl(phi::Allocation* allocation) { } phi::Allocation* CustomAllocator::AllocateImpl(size_t size) { - std::call_once(once_flag_, - [this] { platform::DeviceManager::SetDevice(place_); }); + std::call_once(once_flag_, [this] { phi::DeviceManager::SetDevice(place_); }); void* ptr = - platform::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size); + phi::DeviceManager::GetDeviceWithPlace(place_)->MemoryAllocate(size); if (LIKELY(ptr)) { return new Allocation(ptr, size, place_); } size_t avail, total; - platform::DeviceManager::MemoryStats(place_, &total, &avail); + phi::DeviceManager::MemoryStats(place_, &total, &avail); auto dev_type = 
platform::PlaceHelper::GetDeviceType(place_); auto dev_id = platform::PlaceHelper::GetDeviceId(place_); diff --git a/paddle/fluid/memory/allocation/mmap_allocator.cc b/paddle/fluid/memory/allocation/mmap_allocator.cc index acaf5d548555cc3ee69bc5a03309645006256487..25c2235cce85369babc4d601de96c7475a0b1fbd 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.cc +++ b/paddle/fluid/memory/allocation/mmap_allocator.cc @@ -29,6 +29,155 @@ namespace paddle { namespace memory { namespace allocation { +std::string GetIPCName() { + static std::random_device rd; + std::string handle = "/paddle_"; +#ifdef _WIN32 + handle += std::to_string(GetCurrentProcessId()); +#else + handle += std::to_string(getpid()); +#endif + handle += "_"; + handle += std::to_string(rd()); + return handle; +} + +struct CountInfo { + std::atomic refcount; +}; + +void AllocateMemoryMap(std::string filename, int flags, size_t size, + void **map_ptr_, int *fd_) { + // TODO(@ZHUI): support win32 + int file_flags = 0; + int fd = -1; + if (flags & MAPPED_SHAREDMEM) { + file_flags = O_RDWR | O_CREAT; + } else { + file_flags = O_RDONLY; + } + if (flags & MAPPED_EXCLUSIVE) { + file_flags |= O_EXCL; + } + if (flags & MAPPED_NOCREATE) { + file_flags &= ~O_CREAT; + } + + if (!(flags & MAPPED_FROMFD)) { + if (flags & MAPPED_SHAREDMEM) { + fd = shm_open(filename.c_str(), file_flags, (mode_t)0600); + PADDLE_ENFORCE_NE( + fd, -1, + platform::errors::Unavailable( + "File descriptor %s open failed, unable in read-write mode", + filename.c_str())); + VLOG(6) << "shm_open: " << filename; + } + } else { + fd = -1; + } + + PADDLE_ENFORCE_EQ(ftruncate(fd, size), 0, + platform::errors::Unavailable( + "Fruncate a file to a specified length failed!")); + + if (flags & MAPPED_SHAREDMEM) { + *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + } else { + *map_ptr_ = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + } + + PADDLE_ENFORCE_NE(*map_ptr_, MAP_FAILED, + 
platform::errors::Unavailable( + "Memory map failed when create shared memory.")); + + if (flags & MAPPED_KEEPFD) { + *fd_ = fd; + } else { + PADDLE_ENFORCE_NE(::close(fd), -1, + platform::errors::Unavailable( + "Error closing memory maped file <", filename, ">")); + + *fd_ = -1; + } +} + +std::shared_ptr +AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, + size_t size) { + int fd = -1; + void *base_ptr = nullptr; + AllocateMemoryMap(filename, flags, size + mmap_alignment, &base_ptr, &fd); + void *aliged_base_ptr = + static_cast(static_cast(base_ptr) + mmap_alignment); + return std::make_shared(aliged_base_ptr, size, + filename, flags, fd); +} + +RefcountedMemoryMapAllocation::RefcountedMemoryMapAllocation( + void *ptr, size_t size, std::string ipc_name, int fd, int flags) + : MemoryMapAllocation(ptr, size, ipc_name, fd, flags) { + // must reset base ptr first. + resetBaseptr(); + initializeRefercount(); +} + +void MemoryMapAllocation::close() { + if (closed_) { + return; + } + closed_ = true; +} + +MemoryMapAllocation::~MemoryMapAllocation() { close(); } + +void RefcountedMemoryMapAllocation::incref() { + CountInfo *info = static_cast(map_ptr_); + ++info->refcount; +} + +int RefcountedMemoryMapAllocation::decref() { + CountInfo *info = static_cast(map_ptr_); + return --info->refcount == 0; +} + +void RefcountedMemoryMapAllocation::resetBaseptr() { + map_ptr_ = + static_cast(static_cast(map_ptr_) - mmap_alignment); + map_size_ = map_size_ + mmap_alignment; +} + +void RefcountedMemoryMapAllocation::initializeRefercount() { + CountInfo *info = reinterpret_cast(map_ptr_); + + if (flags_ & MAPPED_EXCLUSIVE) { + new (&info->refcount) std::atomic(1); + } else { + info->refcount++; + } +} + +void RefcountedMemoryMapAllocation::close() { + if (closed_) { + return; + } + closed_ = true; + void *data = map_ptr_; + CountInfo *info = reinterpret_cast(data); + if (--info->refcount == 0) { + PADDLE_ENFORCE_NE( + shm_unlink(ipc_name_.c_str()), -1, + 
platform::errors::Unavailable( + "could not unlink the shared memory file ", ipc_name_)); + VLOG(6) << "shm_unlink file: " << ipc_name_; + } + + PADDLE_ENFORCE_NE( + munmap(map_ptr_, map_size_), -1, + platform::errors::Unavailable("could not unmap the shared memory file: ", + strerror(errno), " (", errno, ")")); +} + MemoryMapWriterAllocation::~MemoryMapWriterAllocation() { PADDLE_ENFORCE_NE( munmap(this->ptr(), this->size()), -1, @@ -44,30 +193,30 @@ MemoryMapReaderAllocation::~MemoryMapReaderAllocation() { /* Here we do not pay attention to the result of shm_unlink, because the memory mapped file may have been cleared due to the MemoryMapFdSet::Clear() */ + + // Code of DataLoader subprocess: + // + // core._array_to_share_memory_tensor(b) + // out_queue.put((idx, tensor_list, structure)) + // core._remove_tensor_list_mmap_fds(tensor_list) + + /* If the tensor in already in the send queue, the tensor will be + * deconstructed by the function. If the tensor not send yet, it + * will be cleared by MemoryMapFdSet::Clear(). + * If the `_remove_tensor_list_mmap_fds` have be interrupted, the + * tensor will be cleared by both methods. 
+ * */ + shm_unlink(this->ipc_name().c_str()); MemoryMapFdSet::Instance().Remove(this->ipc_name()); VLOG(3) << "~MemoryMapReaderAllocation: " << this->ipc_name(); } -std::string GetIPCName() { - static std::random_device rd; - std::string handle = "/paddle_"; -#ifdef _WIN32 - handle += std::to_string(GetCurrentProcessId()); -#else - handle += std::to_string(getpid()); -#endif - handle += "_"; - handle += std::to_string(rd()); - return handle; -} - std::shared_ptr AllocateMemoryMapWriterAllocation( size_t size) { const std::string &ipc_name = GetIPCName(); int flags = O_RDWR | O_CREAT; - - int fd = shm_open(ipc_name.c_str(), flags, 0644); + int fd = shm_open(ipc_name.c_str(), flags, 0600); PADDLE_ENFORCE_NE( fd, -1, platform::errors::Unavailable("File descriptor %s open failed", ipc_name.c_str())); @@ -86,12 +235,14 @@ std::shared_ptr AllocateMemoryMapWriterAllocation( std::shared_ptr RebuildMemoryMapReaderAllocation( const std::string &ipc_name, size_t size) { - int fd = shm_open(ipc_name.c_str(), O_RDONLY, 0644); + int flags = O_RDWR | O_CREAT; + flags &= ~O_CREAT; + + int fd = shm_open(ipc_name.c_str(), flags, 0600); PADDLE_ENFORCE_NE( fd, -1, platform::errors::Unavailable("File descriptor %s open failed", ipc_name.c_str())); - - void *ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + void *ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); PADDLE_ENFORCE_NE(ptr, MAP_FAILED, platform::errors::Unavailable( "Memory map failed when rebuild shared memory.")); diff --git a/paddle/fluid/memory/allocation/mmap_allocator.h b/paddle/fluid/memory/allocation/mmap_allocator.h index 3f91e5c42780826ae0ef2e61e982da2336d10a3f..4f8dbfbb51e66db227dfcf46bc3ce313d8406dd1 100644 --- a/paddle/fluid/memory/allocation/mmap_allocator.h +++ b/paddle/fluid/memory/allocation/mmap_allocator.h @@ -16,8 +16,9 @@ #ifndef _WIN32 +#include #include -#include // NOLINT +#include #include #include #include @@ -28,6 +29,72 @@ namespace paddle { namespace memory { namespace 
allocation { +std::string GetIPCName(); + +static constexpr int64_t mmap_alignment = 64; + +enum MappedModes { + MAPPED_SHAREDMEM = 1, + MAPPED_EXCLUSIVE = 2, + MAPPED_NOCREATE = 4, + MAPPED_KEEPFD = 8, + MAPPED_FROMFD = 16, + MAPPED_UNLINK = 32 +}; + +class MemoryMapAllocation : public Allocation { + public: + explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name) + : Allocation(ptr, size, platform::CPUPlace()), + ipc_name_(std::move(ipc_name)), + map_ptr_(ptr), + map_size_(size) {} + explicit MemoryMapAllocation(void *ptr, size_t size, std::string ipc_name, + int flags, int fd) + : Allocation(ptr, size, platform::CPUPlace()), + ipc_name_(std::move(ipc_name)), + fd_(fd), + flags_(flags), + map_ptr_(ptr), + map_size_(size) {} + + inline const std::string &ipc_name() const { return ipc_name_; } + + virtual void close(); + + ~MemoryMapAllocation() override; + + protected: + std::string ipc_name_; + int fd_ = -1; + int flags_ = 0; + void *map_ptr_ = nullptr; + size_t map_size_ = 0; + bool closed_ = false; +}; + +class RefcountedMemoryMapAllocation : public MemoryMapAllocation { + public: + RefcountedMemoryMapAllocation(void *ptr, size_t size, std::string ipc_name, + int flags, int fd); + + void incref(); + int decref(); + void close() override; + virtual ~RefcountedMemoryMapAllocation() { close(); } + + protected: + void initializeRefercount(); + void resetBaseptr(); +}; + +void AllocateMemoryMap(std::string filename, int flags, size_t size, + void **base_ptr_, int *fd_); + +std::shared_ptr +AllocateRefcountedMemoryMapAllocation(std::string filename, int flags, + size_t size); + class MemoryMapWriterAllocation : public Allocation { public: explicit MemoryMapWriterAllocation(void *ptr, size_t size, diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index ea6d7019be6c1caf4844469276f3113525b33dfc..0bfbe2c6962294fc7e4aa2fff079e9cf411f26f8 100644 --- 
a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -739,7 +739,7 @@ class BuddyAllocatorList { private: explicit BuddyAllocatorList(const std::string &device_type) : device_type_(device_type) { - auto devices = platform::DeviceManager::GetDeviceList(device_type); + auto devices = phi::DeviceManager::GetDeviceList(device_type); for (auto dev_id : devices) { init_flags_[dev_id].reset(new std::once_flag()); } @@ -766,15 +766,15 @@ class BuddyAllocatorList { device_type_, dev_id)); std::call_once(*init_flags_[dev_id], [this, dev_id] { - platform::DeviceManager::SetDevice(device_type_, dev_id); + phi::DeviceManager::SetDevice(device_type_, dev_id); platform::CustomPlace place(device_type_, dev_id); allocators_[dev_id].reset(new BuddyAllocator( std::unique_ptr( new detail::CustomAllocator(device_type_, dev_id)), - platform::DeviceManager::GetMinChunkSize(place), - platform::DeviceManager::GetMaxChunkSize(place), - platform::DeviceManager::GetExtraPaddingSize(place), device_type_)); + phi::DeviceManager::GetMinChunkSize(place), + phi::DeviceManager::GetMaxChunkSize(place), + phi::DeviceManager::GetExtraPaddingSize(place), device_type_)); }); return allocators_[dev_id].get(); @@ -808,9 +808,9 @@ void *Alloc(const platform::CustomPlace &place, auto *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { - platform::DeviceGuard guard(place); + phi::DeviceGuard guard(place); size_t avail, total; - platform::DeviceManager::MemoryStats(place, &total, &avail); + phi::DeviceManager::MemoryStats(place, &total, &avail); PADDLE_THROW(platform::errors::ResourceExhausted( "Cannot allocate %s in %s:%d, avaliable %s, total %s, used " "%s. 
", @@ -819,8 +819,7 @@ void *Alloc(const platform::CustomPlace &place, string::HumanReadableSize(total - avail))); } else { if (FLAGS_init_allocated_mem) { - platform::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, - size); + phi::DeviceManager::GetDeviceWithPlace(place)->MemorySet(ptr, 0xEF, size); } } VLOG(10) << " pointer=" << ptr; diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 8627e3e6f8811e162ce3014c01145f331a03ee4b..072c4dee3bc45b4ff5f23f5288d3412a14f63b0f 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -15,56 +15,52 @@ #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#endif + namespace paddle { namespace memory { namespace allocation { StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( - DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream) + DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream, + StreamSafeCUDAAllocator* allocator) : Allocation(underlying_allocation->ptr(), underlying_allocation->base_ptr(), underlying_allocation->size(), underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)), - owning_stream_(std::move(owning_stream)) {} + owning_stream_(std::move(owning_stream)), + allocator_(allocator->shared_from_this()) {} void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) { VLOG(8) << "Try record stream " << stream << " for address " << ptr(); if (stream == owning_stream_) { - VLOG(9) << "Record the same stream of " << stream; return; } std::lock_guard lock_guard(outstanding_event_map_lock_); - gpuEvent_t record_event; - auto it = outstanding_event_map_.find(stream); - if (it == 
outstanding_event_map_.end()) { - gpuEvent_t new_event; #ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS( - cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); -#endif - outstanding_event_map_[stream] = new_event; - record_event = new_event; - VLOG(9) << "Create a new event " << new_event; - } else { - record_event = it->second; - VLOG(9) << "Reuse event " << record_event; + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + graph_capturing_stream_set_.insert(stream); + return; } - -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); #endif - VLOG(8) << "Record event " << record_event << " to stream " << stream; + + RecordStreamWithNoGraphCapturing(stream); + RecordGraphCapturingStreams(); } bool StreamSafeCUDAAllocation::CanBeFreed() { - // NOTE(Ruibiao): This function will not execute concurrently, - // so outstanding_event_lock_ is not required here +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + return graph_capturing_stream_set_.empty() && + outstanding_event_map_.empty(); + } +#endif + + RecordGraphCapturingStreams(); + for (auto it = outstanding_event_map_.begin(); it != outstanding_event_map_.end(); ++it) { gpuEvent_t& event = it->second; @@ -98,21 +94,62 @@ const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const { return owning_stream_; } +void StreamSafeCUDAAllocation::RecordGraphCapturingStreams() { + for (gpuStream_t stream : graph_capturing_stream_set_) { + RecordStreamWithNoGraphCapturing(stream); + } + graph_capturing_stream_set_.clear(); +} + +void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( + const gpuStream_t& stream) { + gpuEvent_t record_event; + auto it = outstanding_event_map_.find(stream); + if (it == outstanding_event_map_.end()) { 
+ gpuEvent_t new_event; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS( + cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); +#endif + outstanding_event_map_[stream] = new_event; + record_event = new_event; + VLOG(9) << "Create a new event " << new_event; + } else { + record_event = it->second; + VLOG(9) << "Reuse event " << record_event; + } + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); +#endif + VLOG(8) << "Record event " << record_event << " to stream " << stream; +} + StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( std::shared_ptr underlying_allocator, platform::CUDAPlace place, - gpuStream_t default_stream) + gpuStream_t default_stream, bool in_cuda_graph_capturing) : underlying_allocator_(std::move(underlying_allocator)), place_(std::move(place)), - default_stream_(std::move(default_stream)) { - std::lock_guard lock_guard(allocator_map_lock_); - allocator_map_[place].emplace_back(this); + default_stream_(std::move(default_stream)), + in_cuda_graph_capturing_(in_cuda_graph_capturing) { + if (LIKELY(!in_cuda_graph_capturing)) { + std::lock_guard lock_guard(allocator_map_lock_); + allocator_map_[place].emplace_back(this); + } } StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { - std::lock_guard lock_guard(allocator_map_lock_); - std::vector& allocators = allocator_map_[place_]; - allocators.erase(std::remove(allocators.begin(), allocators.end(), this), - allocators.end()); + if (LIKELY(!in_cuda_graph_capturing_)) { + std::lock_guard lock_guard(allocator_map_lock_); + std::vector& allocators = allocator_map_[place_]; + allocators.erase(std::remove(allocators.begin(), allocators.end(), this), + allocators.end()); + } } bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } @@ -140,7 +177,7 @@ 
phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { } StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( static_unique_ptr_cast(std::move(underlying_allocation)), - default_stream_); + default_stream_, this); VLOG(8) << "Allocate " << allocation->size() << " bytes at address " << allocation->ptr(); return allocation; @@ -157,22 +194,27 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) { "StreamSafeCUDAAllocation*", allocation)); VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr(); - std::lock_guard lock_guard(unfreed_allocation_lock_); if (stream_safe_cuda_allocation->CanBeFreed()) { VLOG(9) << "Directly delete allocation"; delete stream_safe_cuda_allocation; } else { VLOG(9) << "Put into unfreed_allocation list"; + std::lock_guard lock_guard(unfreed_allocation_lock_); unfreed_allocations_.emplace_back(stream_safe_cuda_allocation); } } uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { + if (UNLIKELY(in_cuda_graph_capturing_)) { + VLOG(7) << "Memory release forbidden in CUDA Graph Captruing"; + return 0; + } + std::lock_guard lock_guard(allocator_map_lock_); std::vector& allocators = allocator_map_[place]; uint64_t released_size = 0; for (StreamSafeCUDAAllocator* allocator : allocators) { - released_size += allocator->ProcessUnfreedAllocationsWithRelease(); + released_size += allocator->ProcessUnfreedAllocationsAndRelease(); } VLOG(8) << "Release " << released_size << " bytes memory from all streams"; return released_size; @@ -191,7 +233,7 @@ void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() { } } -uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() { +uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsAndRelease() { ProcessUnfreedAllocations(); return underlying_allocator_->Release(place_); } diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h 
index 7354836308cfba0338fb2e146cc14182006876ee..ecddff97c206be968148e32ddf3f9c6623bf8bde 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -14,10 +14,9 @@ #pragma once -#include #include #include -#include +#include #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/spin_lock.h" #include "paddle/fluid/platform/place.h" @@ -32,27 +31,38 @@ namespace paddle { namespace memory { namespace allocation { +class StreamSafeCUDAAllocator; + class StreamSafeCUDAAllocation : public Allocation { public: StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation, - gpuStream_t owning_stream); + gpuStream_t owning_stream, + StreamSafeCUDAAllocator *allocator); + void RecordStream(const gpuStream_t &stream); bool CanBeFreed(); - const gpuStream_t &GetOwningStream() const; private: + void RecordGraphCapturingStreams(); + void RecordStreamWithNoGraphCapturing(const gpuStream_t &stream); DecoratedAllocationPtr underlying_allocation_; + std::set graph_capturing_stream_set_; std::map outstanding_event_map_; gpuStream_t owning_stream_; SpinLock outstanding_event_map_lock_; + // To compatiable with CUDA Graph, hold the allocator shared_ptr so that + // Allocator will not deconstruct before Allocation + std::shared_ptr allocator_; }; -class StreamSafeCUDAAllocator : public Allocator { +class StreamSafeCUDAAllocator + : public Allocator, + public std::enable_shared_from_this { public: StreamSafeCUDAAllocator(std::shared_ptr underlying_allocator, - platform::CUDAPlace place, - gpuStream_t default_stream); + platform::CUDAPlace place, gpuStream_t default_stream, + bool in_cuda_graph_capturing = false); ~StreamSafeCUDAAllocator(); bool IsAllocThreadSafe() const override; @@ -63,7 +73,7 @@ class StreamSafeCUDAAllocator : public Allocator { private: void ProcessUnfreedAllocations(); - uint64_t ProcessUnfreedAllocationsWithRelease(); + uint64_t 
ProcessUnfreedAllocationsAndRelease(); static std::map> allocator_map_; @@ -74,6 +84,8 @@ class StreamSafeCUDAAllocator : public Allocator { gpuStream_t default_stream_; std::list unfreed_allocations_; SpinLock unfreed_allocation_lock_; + + bool in_cuda_graph_capturing_; }; } // namespace allocation diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index d7bbfba932cb4a5aab01bc3e2d1276dbe6450b29..076a96139612168f6c3d5d039184ccdb7a536f2e 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -26,6 +26,7 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif #include "paddle/fluid/platform/device/device_wrapper.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace memory { @@ -43,11 +44,11 @@ BuddyAllocator::BuddyAllocator( #ifdef PADDLE_WITH_CUSTOM_DEVICE if (!dev_type.empty()) { init_allocate_size_func_ = [dev_type]() { - return platform::DeviceManager::GetInitAllocSize( + return phi::DeviceManager::GetInitAllocSize( platform::PlaceHelper::CreatePlace(dev_type)); }; re_allocate_size_func_ = [dev_type]() { - return platform::DeviceManager::GetReallocSize( + return phi::DeviceManager::GetReallocSize( platform::PlaceHelper::CreatePlace(dev_type)); }; } else { diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index a61f98c4e1a22adcc3684a9e5af190a82e3b5110..37ac0b4483291c8c3a3eeb31883c55c7eda24dc8 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -438,7 +438,7 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { void* p; auto place = platform::CustomPlace(dev_type_, dev_id_); - auto device = platform::DeviceManager::GetDeviceWithPlace(place); + auto device = phi::DeviceManager::GetDeviceWithPlace(place); p = device->MemoryAllocate(size); if (LIKELY(p)) { VLOG(4) << "CustomAllocator::Alloc " << p << " size 
" << size; @@ -447,7 +447,7 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) { } else { size_t avail, total; - platform::DeviceManager::MemoryStats(place, &total, &avail); + phi::DeviceManager::MemoryStats(place, &total, &avail); PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on %s %d. " "total memory is %s, used memory is %s, " @@ -470,7 +470,7 @@ void CustomAllocator::Free(void* p, size_t size, size_t index) { size, plug_alloc_size)); plug_alloc_size -= size; auto place = platform::CustomPlace(dev_type_, dev_id_); - auto device = platform::DeviceManager::GetDeviceWithPlace(place); + auto device = phi::DeviceManager::GetDeviceWithPlace(place); device->MemoryDeallocate(p, size); } diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index b60bb4fc1d1bb5e4366625277db8fdb968474891..2bca2c388a05958fda0e891190dcf7e7ddc53b0c 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -41,6 +41,11 @@ std::shared_ptr AllocShared(const platform::Place& place, stream); } +AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const phi::Stream& stream) { + return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); +} + bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream) { return allocation::AllocatorFacade::Instance().InSameStream(allocation, @@ -52,11 +57,6 @@ void* GetBasePtr(const std::shared_ptr& allocation) { } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, - const gpuStream_t& stream) { - return allocation::AllocatorFacade::Instance().Alloc(place, size, stream); -} - uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 
89b4caa5bed26fa9b8d0bf09df702f17a310dff6..601fe3f2a42c391c602887bacccae97125b951e1 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -41,15 +41,15 @@ extern std::shared_ptr AllocShared(const platform::Place& place, size_t size, const phi::Stream& stream); +extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const phi::Stream& stream); + extern bool InSameStream(const std::shared_ptr& allocation, const phi::Stream& stream); extern void* GetBasePtr(const std::shared_ptr& allocation); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, - const gpuStream_t& stream); - extern uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 166cdd0b5d6b6a523cfe470662951184ebbfabc5..3198b4f8d935e3815ba94db945a24ab4df4ca97b 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -44,9 +44,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << ", stream=" << stream; - platform::DeviceManager::SetDevice(src_place); - platform::stream::Stream stream_wrapper(src_place, stream); - platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H( + phi::DeviceManager::SetDevice(src_place); + phi::stream::Stream stream_wrapper(src_place, stream); + phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2H( dst, src, num, &stream_wrapper); } @@ -62,9 +62,9 @@ void Copy( VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << ", stream=" << stream; - platform::DeviceManager::SetDevice(dst_place); - platform::stream::Stream stream_wrapper(dst_place, stream); - platform::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D( + phi::DeviceManager::SetDevice(dst_place); + phi::stream::Stream stream_wrapper(dst_place, stream); + 
phi::DeviceManager::GetDeviceWithPlace(dst_place)->MemoryCopyH2D( dst, src, num, &stream_wrapper); } @@ -82,16 +82,16 @@ void Copy( << dst_place << ", stream=" << stream; if (src_type == dst_type) { - platform::DeviceManager::SetDevice(src_place); - platform::stream::Stream stream_wrapper(src_place, stream); + phi::DeviceManager::SetDevice(src_place); + phi::stream::Stream stream_wrapper(src_place, stream); auto src_id = platform::PlaceHelper::GetDeviceId(src_place); auto dst_id = platform::PlaceHelper::GetDeviceId(dst_place); if (src_id == dst_id) { - platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D( + phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyD2D( dst, src, num, &stream_wrapper); } else { - platform::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P( + phi::DeviceManager::GetDeviceWithPlace(src_place)->MemoryCopyP2P( dst_place, dst, src, num, &stream_wrapper); } } else { diff --git a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu index 933717f3090c4b25f912e0bbe87922a1494c128a..5e4a4234bb41663f2287203fa9123029e6894036 100644 --- a/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu +++ b/paddle/fluid/memory/stream_safe_cuda_alloc_test.cu @@ -12,34 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifdef PADDLE_WITH_CUDA -#include -#include -#endif - -#ifdef PADDLE_WITH_HIP -#include -#endif - #include // NOLINT #include #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/core/stream.h" +#ifdef PADDLE_WITH_CUDA +#include +#include +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#endif + +#ifdef PADDLE_WITH_HIP +#include +#endif + namespace paddle { namespace memory { -__global__ void add_kernel(int *x, int n) { +// y += (x + 1) +__global__ void add_kernel(int *x, int *y, int n) { int thread_num = gridDim.x * blockDim.x; int thread_id = blockIdx.x * blockDim.x + threadIdx.x; for (int i = thread_id; i < n; i += thread_num) { - atomicAdd(x + i, thread_id); + y[i] += x[i] + 1; } } @@ -51,153 +52,6 @@ void CheckMemLeak(const platform::CUDAPlace &place) { << " there may be a memory leak problem"; } -class StreamSafeCUDAAllocTest : public ::testing::Test { - protected: - void SetUp() override { - place_ = platform::CUDAPlace(); - stream_num_ = 64; - grid_num_ = 1; - block_num_ = 32; - data_num_ = 131072; - workspace_size_ = data_num_ * sizeof(int); - - // alloc workspace for each stream - for (size_t i = 0; i < stream_num_; ++i) { - gpuStream_t stream; -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); -#endif - - std::shared_ptr allocation = - AllocShared(place_, workspace_size_, - phi::Stream(reinterpret_cast(stream))); -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemset(allocation->ptr(), 0, allocation->size())); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemset(allocation->ptr(), 0, allocation->size())); -#endif - - 
streams_.emplace_back(stream); - workspaces_.emplace_back(allocation); - } - - result_ = Alloc(place_, stream_num_ * workspace_size_); - } - - void SingleStreamRun(size_t idx) { - // for all stream i, - // stream idx lauch a kernel to add (j % thread_num) to workspaces_[i][j] - for (size_t i = 0; i < stream_num_; ++i) { - int *x = reinterpret_cast(workspaces_[i]->ptr()); - add_kernel<<>>(x, data_num_); - RecordStream(workspaces_[i], streams_[idx]); - } - } - - void CopyResultAsync() { - for (size_t i = 0; i < stream_num_; ++i) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( - reinterpret_cast(result_->ptr()) + i * data_num_, - workspaces_[i]->ptr(), workspace_size_, cudaMemcpyDeviceToDevice)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( - reinterpret_cast(result_->ptr()) + i * data_num_, - workspaces_[i]->ptr(), workspace_size_, hipMemcpyDeviceToDevice)); -#endif - } - } - - void MultiStreamRun() { - for (size_t i = 0; i < stream_num_; ++i) { - SingleStreamRun(i); - } - CopyResultAsync(); - workspaces_.clear(); // fast_gc - cudaDeviceSynchronize(); - } - - void MultiThreadMUltiStreamRun() { - std::vector threads; - for (size_t i = 0; i < stream_num_; ++i) { - threads.push_back( - std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); - } - for (size_t i = 0; i < stream_num_; ++i) { - threads[i].join(); - } - CopyResultAsync(); - workspaces_.clear(); // fast_gc - cudaDeviceSynchronize(); - } - - void CheckResult() { - auto result_host = std::unique_ptr(new int[result_->size()]); -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(result_host.get(), result_->ptr(), - result_->size(), - cudaMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpy(result_host.get(), result_->ptr(), - result_->size(), - hipMemcpyDeviceToHost)); -#endif - size_t thread_num = grid_num_ * block_num_; - for (size_t i = 0; i < stream_num_; ++i) { - for (size_t j = 0; j < data_num_; ++j) { - EXPECT_TRUE(result_host[i * 
stream_num_ + j] == - (j % thread_num) * stream_num_); - } - } - result_.reset(); - } - - void TearDown() override { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); -#endif - for (gpuStream_t stream : streams_) { - Release(place_, stream); - } - - for (size_t i = 1; i < stream_num_; ++i) { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i])); -#else - PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i])); -#endif - } - - CheckMemLeak(place_); - } - - size_t stream_num_; - size_t grid_num_; - size_t block_num_; - size_t data_num_; - size_t workspace_size_; - platform::CUDAPlace place_; - std::vector streams_; - std::vector> workspaces_; - allocation::AllocationPtr result_; -}; - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { - MultiStreamRun(); - CheckResult(); -} - -TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { - MultiThreadMUltiStreamRun(); - CheckResult(); -} - TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { platform::CUDAPlace place = platform::CUDAPlace(); size_t alloc_size = 256; @@ -214,7 +68,8 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) { paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); allocation::AllocationPtr allocation_unique = - Alloc(place, alloc_size, default_stream); + Alloc(place, alloc_size, + phi::Stream(reinterpret_cast(default_stream))); EXPECT_GE(allocation_unique->size(), alloc_size); EXPECT_EQ(allocation_unique->ptr(), address); allocation_unique.reset(); @@ -303,36 +158,6 @@ TEST(StreamSafeCUDAAllocInterfaceTest, GetStreamInterfaceTest) { CheckMemLeak(place); } -#ifdef PADDLE_WITH_CUDA -TEST(StreamSafeCUDAAllocInterfaceTest, CUDAGraphExceptionTest) { - platform::CUDAPlace place = platform::CUDAPlace(); - size_t alloc_size = 1; - std::shared_ptr allocation = AllocShared(place, alloc_size); - - platform::BeginCUDAGraphCapture(place, 
cudaStreamCaptureModeGlobal); - EXPECT_THROW(AllocShared(place, alloc_size), paddle::platform::EnforceNotMet); - EXPECT_THROW(Alloc(place, alloc_size), paddle::platform::EnforceNotMet); - EXPECT_THROW(Release(place), paddle::platform::EnforceNotMet); - EXPECT_THROW(allocation::AllocatorFacade::Instance().GetAllocator(place), - paddle::platform::EnforceNotMet); - EXPECT_THROW( - AllocShared(place, alloc_size, - phi::Stream(reinterpret_cast(nullptr))), - paddle::platform::EnforceNotMet); - EXPECT_THROW(Alloc(place, alloc_size, nullptr), - paddle::platform::EnforceNotMet); - EXPECT_THROW(Release(place, nullptr), paddle::platform::EnforceNotMet); - EXPECT_THROW(RecordStream(allocation, nullptr), - paddle::platform::EnforceNotMet); - EXPECT_THROW(GetStream(allocation), paddle::platform::EnforceNotMet); - platform::EndCUDAGraphCapture(); - - allocation.reset(); - Release(place); - CheckMemLeak(place); -} -#endif - TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { platform::CUDAPlace place = platform::CUDAPlace(); gpuStream_t stream1, stream2; @@ -348,12 +173,14 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { // so the second alloc will fail and retry size_t alloc_size = available_size / 4 * 3; - allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1); + allocation::AllocationPtr allocation1 = Alloc( + place, alloc_size, phi::Stream(reinterpret_cast(stream1))); allocation::AllocationPtr allocation2; std::thread th([&allocation2, &place, &stream2, alloc_size]() { std::this_thread::sleep_for(std::chrono::seconds(1)); - allocation2 = Alloc(place, alloc_size, stream2); + allocation2 = Alloc(place, alloc_size, + phi::Stream(reinterpret_cast(stream2))); }); allocation1.reset(); // free but not release th.join(); @@ -371,5 +198,201 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) { CheckMemLeak(place); } +class StreamSafeCUDAAllocTest : public ::testing::Test { + protected: + void SetUp() override { + place_ = platform::CUDAPlace(); + stream_num_ = 64; + 
grid_num_ = 1; + block_num_ = 32; + data_num_ = 131072; + workspace_size_ = data_num_ * sizeof(int); + + for (size_t i = 0; i < stream_num_; ++i) { + gpuStream_t stream; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream)); +#endif + + std::shared_ptr workspace_allocation = + AllocShared(place_, workspace_size_, + phi::Stream(reinterpret_cast(stream))); + std::shared_ptr result_allocation = + AllocShared(place_, workspace_size_, + phi::Stream(reinterpret_cast(stream))); + std::shared_ptr host_result_allocation = + AllocShared(platform::CPUPlace(), workspace_size_); + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset(workspace_allocation->ptr(), 0, + workspace_allocation->size())); + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemset(result_allocation->ptr(), 0, result_allocation->size())); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipMemset(workspace_allocation->ptr(), 0, + workspace_allocation->size())); + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemset(result_allocation->ptr(), 0, result_allocation->size())); +#endif + + streams_.emplace_back(stream); + workspaces_.emplace_back(workspace_allocation); + results_.emplace_back(result_allocation); + host_results_.emplace_back(host_result_allocation); + } + } + + void SingleStreamRun(size_t idx) { + int *y = reinterpret_cast(results_[idx]->ptr()); + int neighbouring_idx = idx > 0 ? 
idx - 1 : idx; + + add_kernel<<>>( + reinterpret_cast(workspaces_[idx]->ptr()), y, data_num_); + add_kernel<<>>( + reinterpret_cast(workspaces_[neighbouring_idx]->ptr()), y, + data_num_); + RecordStream(workspaces_[neighbouring_idx], streams_[idx]); + } + + void MultiStreamRun() { + // Must run in reverse order, or the workspace_[i - 1] will be released + // before streams_[i]'s kernel launch + for (int i = stream_num_ - 1; i >= 0; --i) { + SingleStreamRun(i); + workspaces_[i].reset(); // fast GC + } + } + + void MultiThreadMultiStreamRun() { + std::vector threads; + for (size_t i = 0; i < stream_num_; ++i) { + threads.push_back( + std::thread(&StreamSafeCUDAAllocTest::SingleStreamRun, this, i)); + } + for (size_t i = 0; i < stream_num_; ++i) { + threads[i].join(); + } + workspaces_.clear(); + } + + void CUDAGraphRun() { + testing_cuda_graph_ = true; + platform::BeginCUDAGraphCapture(platform::CUDAPlace(), + cudaStreamCaptureModeGlobal); + + std::shared_ptr data_allocation = + AllocShared(platform::CUDAPlace(), workspace_size_); + std::shared_ptr result_allocation = + AllocShared(platform::CUDAPlace(), workspace_size_); + + int *data = static_cast(data_allocation->ptr()); + int *result = static_cast(result_allocation->ptr()); + + gpuStream_t main_stream = GetStream(data_allocation); + gpuStream_t other_stream; + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&other_stream)); + + add_kernel<<>>(data, result, + data_num_); + RecordStream(data_allocation, other_stream); + + std::unique_ptr cuda_graph = + platform::EndCUDAGraphCapture(); + + int replay_times = 10; + for (int i = 0; i < replay_times; ++i) { + cuda_graph->Replay(); + } + + std::shared_ptr host_result_allocation = + AllocShared(platform::CPUPlace(), workspace_size_); + Copy(host_result_allocation->place(), host_result_allocation->ptr(), + result_allocation->place(), result_allocation->ptr(), workspace_size_, + main_stream); + cudaStreamSynchronize(main_stream); + + int *host_result = 
static_cast(host_result_allocation->ptr()); + for (int i = 0; i < data_num_; ++i) { + EXPECT_EQ(host_result[i], replay_times); + } + + data_allocation.reset(); + result_allocation.reset(); + cuda_graph.release(); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(other_stream)); + } + + void CheckResult() { + for (size_t i = 0; i < stream_num_; ++i) { + Copy(host_results_[i]->place(), host_results_[i]->ptr(), + results_[i]->place(), results_[i]->ptr(), workspace_size_, + streams_[i]); + } + cudaDeviceSynchronize(); + + size_t thread_num = grid_num_ * block_num_; + for (size_t i = 0; i < stream_num_; ++i) { + int *result = static_cast(host_results_[i]->ptr()); + for (size_t j = 0; j < data_num_; ++j) { + EXPECT_EQ(result[j], 2); + } + } + } + + void TearDown() override { + workspaces_.clear(); + results_.clear(); + host_results_.clear(); + for (gpuStream_t stream : streams_) { + Release(place_, stream); + } + + for (size_t i = 0; i < stream_num_; ++i) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(streams_[i])); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(streams_[i])); +#endif + } + + // Memory release for CUDA Graph memory pool is forbidden + if (!testing_cuda_graph_) { + CheckMemLeak(place_); + } + } + + bool testing_cuda_graph_{0}; + size_t stream_num_; + size_t grid_num_; + size_t block_num_; + size_t data_num_; + size_t workspace_size_; + platform::CUDAPlace place_; + std::vector streams_; + std::vector> workspaces_; + std::vector> results_; + std::vector> host_results_; +}; + +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) { + MultiStreamRun(); + CheckResult(); +} + +TEST_F(StreamSafeCUDAAllocTest, CUDAMutilThreadMutilStreamTest) { + MultiThreadMultiStreamRun(); + CheckResult(); +} + +#ifdef PADDLE_WITH_CUDA +TEST_F(StreamSafeCUDAAllocTest, CUDAGraphTest) { + MultiStreamRun(); + CUDAGraphRun(); + CheckResult(); +} +#endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt 
b/paddle/fluid/operators/CMakeLists.txt index 91a0352e1915e95378012aa398ff996cbc10f216..e77be832c0cc8975c3fc2ebb7fad577cdfe919f5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -161,7 +161,7 @@ cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEP set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function lod_tensor maxouting unpooling pooling lod_rank_table context_project -sequence_pooling segment_pooling executor device_memory_aligment generator) +sequence_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve) diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index c28026a4bd43aac5b0c447e24a164e27233076e8..e1460629fb18a4259731c2c9de4ed8f623b5a1e4 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -141,8 +141,8 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 0ac29e6d3ada7335cab510ef82c9f46d2da7eb05..b4a97e24cf29233776b19aa0ea7764a00435f6fc 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -132,7 +132,9 @@ struct 
CudnnReluGradFunctor : public CudnnActivationGradFunctor { explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -146,7 +148,9 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { : CudnnActivationGradFunctor(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -159,7 +163,9 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_SIGMOID) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -172,7 +178,9 @@ struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor { explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -197,7 +205,8 @@ class CudnnActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out."); + static_assert(Functor::FwdDeps() == ActBwdOpFwdDeps::kDepOut, + "Forward deps must be Out."); const framework::Tensor *X, *Out, *dOut; X = Out = dOut = nullptr; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 
73d65b7c6e7e0a5be2d680afba971d54b492c05d..4205f2253a652ccc5f6d4886df1b1194f5e5062f 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -34,7 +34,8 @@ using paddle::framework::Tensor; template static constexpr bool CanInplaceAct() { - return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps; + return GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kDepOut || + GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kNoDeps; } #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ @@ -921,7 +922,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -931,7 +933,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DOut")) { ctx->ShareDim("Out", "DOut"); ctx->ShareLoD("Out", "DOut"); @@ -960,13 +963,15 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("X", "DDOut"); ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("Out", "DDOut"); ctx->ShareLoD("Out", "DDOut"); @@ -987,7 +992,8 @@ class 
ActivationOpTripleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -997,7 +1003,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("D_DOut")) { ctx->ShareDim("Out", "D_DOut"); ctx->ShareLoD("Out", "D_DOut"); @@ -1464,6 +1471,21 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +REGISTER_ACTIVATION_OP(cos, Cos, CosFunctor, CosGradFunctor) +REGISTER_ACTIVATION_OP(tan, Tan, TanFunctor, TanGradFunctor); +REGISTER_ACTIVATION_OP(acos, Acos, AcosFunctor, AcosGradFunctor); +REGISTER_ACTIVATION_OP(sin, Sin, SinFunctor, SinGradFunctor); +REGISTER_ACTIVATION_OP(asin, Asin, AsinFunctor, AsinGradFunctor); +REGISTER_ACTIVATION_OP(atan, Atan, AtanFunctor, AtanGradFunctor); +REGISTER_ACTIVATION_OP(sinh, Sinh, SinhFunctor, SinhGradFunctor); +REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); +REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); +REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); +REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); +REGISTER_ACTIVATION_OP(brelu, BRelu, BReluFunctor, BReluGradFunctor); +REGISTER_ACTIVATION_OP(thresholded_relu, ThresholdedRelu, + ThresholdedReluFunctor, ThresholdedReluGradFunctor); + /* ========================== sigmoid register ============================= */ // 1. 
Register Sigmoid Operator @@ -1548,23 +1570,6 @@ REGISTER_OPERATOR( ops::ActivationOpTripleGrad::FwdDeps()>, ops::ActivationTripleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); -REGISTER_OP_CPU_KERNEL( - tanh_grad_grad, ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); -// Register TripleGrad Kernel -REGISTER_OP_CPU_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); /* ========================================================================== */ /* ========================== relu register ============================= */ @@ -1584,16 +1589,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluCPUFunctor, ReluGradFunctor); - -REGISTER_OP_CPU_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); /* ========================================================================== */ /* ======================== leaky relu register ============================ */ @@ -1614,16 +1609,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, - LeakyReluGradFunctor); -REGISTER_OP_CPU_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor>); /* ========================================================================== */ /* ======================== elu register ============================ */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index ff41da86f7bb6ba8406d58804888b5dcd8bc3be0..b076db01c22c62b17fdd85b7208467eea1375fed 100644 --- 
a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -35,16 +35,14 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { using framework::To32BitIndex; -enum ActBwdOpFwdDeps { - kNoDeps = 0x00, // Do not need any forward input/output - kDepX = 0x01, // Only need forward input X - kDepOut = 0x02, // Only need forward output Out -}; +using ActBwdOpFwdDeps = phi::funcs::ActBwdOpFwdDeps; /* The following operator can be used to process SelectedRows, because the * output of those operator for zero is zero too. @@ -89,7 +87,8 @@ inline void ExtractActivationGradTensor( auto x_grad_var = context.OutputVar(framework::GradVarName("X")); const framework::Variable* out_var = nullptr; - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { out_var = context.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( @@ -139,7 +138,7 @@ inline void ExtractActivationGradTensor( "Output(Out), variable name = %s", context.OutputName(framework::GradVarName("X")))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = context.InputVar("X"); PADDLE_ENFORCE_NOT_NULL(x_var, platform::errors::NotFound( "Cannot get the tensor from the " @@ -248,6 +247,39 @@ struct SigmoidFunctor : public BaseActivationFunctor { } }; +#define USE_PHI_FUNCTOR(name) \ + template \ + using name##Functor = phi::funcs::name##Functor; \ + template \ + using name##GradFunctor = phi::funcs::name##GradFunctor; + +#define USE_PHI_DOUBLE_GRAD_FUNCTOR(name) \ + template \ + using name##GradGradFunctor = phi::funcs::name##GradGradFunctor; + +#define USE_PHI_TRIPLE_GRAD_FUNCTOR(name) \ + template \ + using name##TripleGradFunctor = phi::funcs::name##TripleGradFunctor; + 
+USE_PHI_FUNCTOR(Cos) +USE_PHI_FUNCTOR(Tan) +USE_PHI_FUNCTOR(Acos) +USE_PHI_FUNCTOR(Sin) +USE_PHI_FUNCTOR(Asin) +USE_PHI_FUNCTOR(Atan) +USE_PHI_FUNCTOR(Sinh) +USE_PHI_FUNCTOR(Cosh) +USE_PHI_FUNCTOR(Asinh) +USE_PHI_FUNCTOR(Acosh) +USE_PHI_FUNCTOR(Atanh) +USE_PHI_FUNCTOR(Tanh) +USE_PHI_DOUBLE_GRAD_FUNCTOR(Tanh) +USE_PHI_TRIPLE_GRAD_FUNCTOR(Tanh) +USE_PHI_FUNCTOR(BRelu) +USE_PHI_FUNCTOR(ThresholdedRelu) +USE_PHI_FUNCTOR(LeakyRelu) +USE_PHI_DOUBLE_GRAD_FUNCTOR(LeakyRelu) + template struct SigmoidGradFunctor : public BaseActivationFunctor { template { dx.device(d) = dout * out * (static_cast(1) - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -293,7 +327,9 @@ struct SigmoidGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out) * out * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -351,7 +387,9 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor { (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // silu(x) = x / (1 + exp(-x)) @@ -376,7 +414,7 @@ struct SiluGradFunctor : public BaseActivationFunctor { (static_cast(1) + (temp2 / temp1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // Originally: logsigmoid(x) = -log (1 + exp(-x)) @@ -414,7 +452,7 @@ struct LogSigmoidGradFunctor : public BaseActivationFunctor { dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; 
} }; // exp(x) = e^x @@ -434,7 +472,9 @@ struct ExpGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // expm1(x) = e^x - 1 @@ -454,143 +494,23 @@ struct Expm1GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // relu(x) = max(x, 0) -template -struct ReluCPUFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr([] HOSTDEVICE(T v) { - return v > static_cast(0) ? v : static_cast(0); - }); - } -}; template -struct ReluCUDAFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.cwiseMax(static_cast(0)); - } -}; - +using ReluCPUFunctor = phi::funcs::ReluCPUFunctor; template -struct ReluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (out > static_cast(0)).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; +using ReluGradFunctor = phi::funcs::ReluGradFunctor; -// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template -struct TanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.tanh(); - } -}; +using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; template -struct TanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (static_cast(1) - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template -struct TanhGradGradFunctor : 
public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - framework::Tensor* dOutNew, framework::Tensor* ddOut) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhGradGrad")); - // tanh grad grad : ddout = (1 - out^2) * ddx, dout = - (dout_old * 2 * out - // * ddx) - if (dOutNew) { - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); - auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); - dout_new.device(*d) = - static_cast(-1) * dout * static_cast(2) * out * ddx; - } - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); - ddout.device(*d) = (static_cast(1) - out * out) * ddx; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; -/* - Out - DOut D_Dout - DDx -> TanhTripleGrad -> D_DDx - D_DDout d_OutNew - D_Dout_new - - D_Dout = (-2) * Out * DDx * D_Dout_new - D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new - D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new - - Out, DDX, DOut, D_DDOut, D_DOut_New // input - D_OutNew, D_DOut, D_DDx // output -*/ -template -struct TanhTripleGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* Out, - const framework::Tensor* ddX, const framework::Tensor* dOut, - const framework::Tensor* d_DDOut, - const framework::Tensor* d_dOut_New, - framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, - framework::Tensor* d_DDx) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", 
"DDX", "TanhTripleGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); - auto dout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); - auto d_ddOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); - auto d_dOutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); - - if (d_Out_New) { - auto d_OutNew = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); - d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - - (static_cast(2) * dout * ddx * d_dOutNew); - } - if (d_d_Out) { - auto d_dOut = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad")); - d_dOut.device(*d) = static_cast(-2) * out * ddx * d_dOutNew; - } - if (d_DDx) { - auto d_ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad")); - d_ddx.device(*d) = (static_cast(1) - (out * out)) * d_ddOut - - static_cast(2) * out * dout * d_dOutNew; - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; +using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; // tanhshrink(x) = x - tanh(x) // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) @@ -610,7 +530,7 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x.tanh() * x.tanh()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // tanhshrink(x) = x - tanh(x) @@ -646,7 +566,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 || temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // 
softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 @@ -682,7 +602,7 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // sqrt(x) = x^(1/2) @@ -702,7 +622,9 @@ struct SqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0.5) * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // rsqrt(x) = x^(-1/2) @@ -722,7 +644,9 @@ struct RsqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(-0.5) * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // ceil(x) = ceiling(x) @@ -742,7 +666,9 @@ struct ZeroGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0) * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; + } }; // floor(x) = flooring(x) @@ -754,373 +680,6 @@ struct FloorFunctor : public BaseActivationFunctor { } }; -template -struct Sine { - HOSTDEVICE T operator()(const T& val) const { return sin(val); } -}; - -template <> -struct Sine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sin(static_cast(val))); - } -}; - -template -struct Cosine { - HOSTDEVICE T operator()(const T& val) const { return cos(val); } -}; - -template <> -struct Cosine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(cos(static_cast(val))); - } -}; - -// cosine'(x) = -sin(x) -template -struct CosGradFunctor : public BaseActivationFunctor { 
- template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = -dout * x.unaryExpr(Sine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosine(x) = cos(x) -template -struct CosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosine()); - } -}; - -// sine'(x) = cos(x) -template -struct SinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// sine(x) = sin(x) -template -struct SinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sine()); - } -}; - -template -struct Tangent { - HOSTDEVICE T operator()(const T& val) const { return tan(val); } -}; - -template <> -struct Tangent { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(tan(static_cast(val))); - } -}; - -// Tangent'(x) = -Tangent(x) -template -struct TanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout / x.unaryExpr(Cosine()).square(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// Tangent(x) = tan(x) -template -struct TanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Tangent()); - } -}; - -template -struct Sinh { - HOSTDEVICE T operator()(const T& val) const { return sinh(val); } -}; - -template <> -struct Sinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sinhf(static_cast(val))); - } -}; - -template -struct Cosh { - HOSTDEVICE T operator()(const T& val) const { 
return cosh(val); } -}; - -template <> -struct Cosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(coshf(static_cast(val))); - } -}; - -// sinh(x) = sinh(x) -template -struct SinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sinh()); - } -}; - -// cosh(x) = cosh(x) -template -struct CoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosh()); - } -}; - -// sinh'(x) = cosh(x) -template -struct SinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosh'(x) = sinh(x) -template -struct CoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Sinh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acos { - HOSTDEVICE T operator()(const T& val) const { return acos(val); } -}; - -template <> -struct Acos { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acos(static_cast(val))); - } -}; - -// Acos(x) = acos(x) -template -struct AcosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acos()); - } -}; - -// acos'(x) = -1/sqrt(1-x^2) -template -struct AcosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asin { - HOSTDEVICE 
T operator()(const T& val) const { return asin(val); } -}; - -template <> -struct Asin { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asin(static_cast(val))); - } -}; - -// Asin(x) = asin(x) -template -struct AsinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asin()); - } -}; - -// asin'(x) = 1/sqrt(1-x^2) -template -struct AsinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atan { - HOSTDEVICE T operator()(const T& val) const { return atan(val); } -}; - -template <> -struct Atan { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atan(static_cast(val))); - } -}; - -// Atan(x) = atan(x) -template -struct AtanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atan()); - } -}; - -// atan'(x) = 1 / (1 + x^2) -template -struct AtanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acosh { - HOSTDEVICE T operator()(const T& val) const { return acosh(val); } -}; - -template <> -struct Acosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acosh(static_cast(val))); - } -}; - -// Acosh(x) = acosh(x) -template -struct AcoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = 
x.unaryExpr(Acosh()); - } -}; - -// acosh'(x) = 1/sqrt(x^2 - 1) -template -struct AcoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x * x - static_cast(1)).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asinh { - HOSTDEVICE T operator()(const T& val) const { return asinh(val); } -}; - -template <> -struct Asinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asinh(static_cast(val))); - } -}; - -// Asinh(x) = asinh(x) -template -struct AsinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asinh()); - } -}; - -// asinh'(x) = 1/sqrt(x^2 + 1) -template -struct AsinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x.square() + static_cast(1)).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atanh { - HOSTDEVICE T operator()(const T& val) const { return atanh(val); } -}; - -template <> -struct Atanh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atanh(static_cast(val))); - } -}; - -// Atanh(x) = atanh(x) -template -struct AtanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atanh()); - } -}; - -// atanh'(x) = 1/(1 - x^2) -template -struct AtanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) - x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - // round(x) = [x] template struct 
RoundFunctor : public BaseActivationFunctor { @@ -1147,7 +706,9 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(-1) * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // log(x) = natural logarithm of x @@ -1167,7 +728,7 @@ struct LogGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log2(x) = logarithm to the base 2 of the elements of x @@ -1188,7 +749,7 @@ struct Log2GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log10(x) = logarithm to the base 10 of the elements of x @@ -1209,7 +770,7 @@ struct Log10GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(10))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log1p(x) = natural logarithm of x+1 @@ -1229,7 +790,7 @@ struct Log1pGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / (x + static_cast(1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // square(x) = x^2 @@ -1249,43 +810,7 @@ struct SquareGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(2) * x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct BReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - // NOTE: Explicit 
hides the `BaseActivationFunctor::GetAttrs` - // not polymorphism for speed. - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - template - void operator()(Device d, X x, Out out) const { - out.device(d) = - x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); - } -}; - -template -struct BReluGradFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * - ((x > static_cast(t_min)) * (x < static_cast(t_max))) - .template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // relu6(x) = min(max(0, x), 6) @@ -1319,7 +844,9 @@ struct Relu6GradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // HardSwish = min(max(0, x+3), 6) * x / 6 @@ -1364,7 +891,7 @@ struct HardSwishGradFunctor : public BaseActivationFunctor { static_cast(1) * (static_cast(1) - tmp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // For numerical stability, using the following formula instead of softplus(x) = @@ -1409,7 +936,7 @@ struct SoftplusGradFunctor : public BaseActivationFunctor { .select(dout, dout / (static_cast(1) + (-x_beta).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // mish(x) = x * tanh(softplus(x)) @@ -1449,7 +976,7 @@ struct MishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (tsp + x * 
(static_cast(1) - tsp * tsp) * gsp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softsign(x) = x / (1 + |x|) @@ -1472,7 +999,7 @@ struct SoftsignGradFunctor : public BaseActivationFunctor { dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1504,42 +1031,9 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template -struct LeakyReluFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - template - void operator()(Device d, X x, Out out) const { - if (alpha < 1.f) { - out.device(d) = x.cwiseMax(static_cast(alpha) * x); - } else { - out.device(d) = x.cwiseMin(static_cast(alpha) * x); - } - } -}; - -template -struct LeakyReluGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; } - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = - static_cast(alpha) * (x < static_cast(0)).template cast(); - auto temp2 = (x >= static_cast(0)).template cast(); - dx.device(d) = dout * (temp1 + temp2).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -1573,7 +1067,7 @@ struct ELUGradFunctor : public BaseActivationFunctor { .select(dout, dout * (out + static_cast(alpha))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; 
} }; template @@ -1592,7 +1086,7 @@ struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { .select(dout, dout * static_cast(alpha) * x.exp()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1672,7 +1166,7 @@ struct CELUGradFunctor : public BaseActivationFunctor { dout * (x / static_cast(alpha)).exp() * temp_a_neg * temp_x_neg; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 @@ -1701,7 +1195,7 @@ struct PowGradFunctor : public BaseActivationFunctor { x.pow(static_cast(factor) - static_cast(1)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1766,38 +1260,7 @@ struct STanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * a * b * (static_cast(1) - temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct ThresholdedReluFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out) const { - auto th = static_cast(threshold); - out.device(d) = (x > th).template cast() * x; - } -}; - -template -struct ThresholdedReluGradFunctor : public BaseActivationFunctor { - float threshold; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto th = static_cast(threshold); - dx.device(d) = dout * (x > th).template cast(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return 
ActBwdOpFwdDeps::kDepX; } }; template @@ -1832,7 +1295,9 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { static_cast(slope); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1865,121 +1330,7 @@ struct SwishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * ((static_cast(beta) * out) + temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -/* - * in arguments: x, out, ddx - * out arguments: ddout, dout, dx - */ -template -inline void ExtractActivationDoubleGradTensor( - const framework::ExecutionContext& ctx, const framework::Tensor** X, - const framework::Tensor** Out, const framework::Tensor** ddX, - framework::Tensor** dX, framework::Tensor** dOut, - framework::Tensor** ddOut) { - auto ddx_var = ctx.InputVar("DDX"); - auto ddo_var = ctx.OutputVar("DDOut"); - PADDLE_ENFORCE_NOT_NULL( - ddx_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("DDX"))); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var); - if (ddo_var) { - *ddOut = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - ddo_var); - } - } else { - *ddX = ctx.Input("DDX"); - if (ddo_var) { - *ddOut = ctx.Output("DDOut"); - } - } - PADDLE_ENFORCE_NOT_NULL( - *ddX, - platform::errors::NotFound( - "Cannot get the tensor from the Variable Output, variable name = %s", - ctx.OutputName("DDX"))); - - if (static_cast(kDepValue) & static_cast(kDepX)) { - auto x_var = ctx.InputVar("X"); - PADDLE_ENFORCE_NOT_NULL( - x_var, platform::errors::NotFound( - "Cannot get input Variable Out, variable name = %s", - ctx.InputName("X"))); - auto dx_var = ctx.OutputVar("DX"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); - if 
(dx_var) { - *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dx_var); - } - } else { - *X = ctx.Input("X"); - if (dx_var) { - *dX = ctx.Output("DX"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *X = *ddX; - } - if (static_cast(kDepValue) & static_cast(kDepOut)) { - auto out_var = ctx.InputVar("Out"); - PADDLE_ENFORCE_NOT_NULL( - out_var, - platform::errors::NotFound( - "Cannot get the tensor from the Variable Out, variable name = %s", - ctx.InputName("Out"))); - auto dout_var = ctx.OutputVar("DOut"); - if (CanBeUsedBySelectedRows.count(ctx.Type())) { - *Out = - paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var); - if (dout_var) { - *dOut = - paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( - dout_var); - } - } else { - *Out = ctx.Input("Out"); - if (dout_var) { - *dOut = ctx.Output("DOut"); - } - } - } else { - VLOG(10) << "Inplace activation of Op: " << ctx.Type(); - *Out = *ddX; - } -} - -template -class ActivationDoubleGradKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Tensor *X, *Out, *ddX; - X = Out = ddX = nullptr; - framework::Tensor *ddOut, *dOut, *dX; - ddOut = dOut = dX = nullptr; - - ExtractActivationDoubleGradTensor(ctx, &X, &Out, &ddX, - &dX, &dOut, &ddOut); - - if (ddOut) ddOut->mutable_data(ctx.GetPlace()); - if (dOut) dOut->mutable_data(ctx.GetPlace()); - if (dX) dX->mutable_data(Out->dims(), ctx.GetPlace()); - - auto& place = ctx.template device_context(); - - Functor functor; - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = ctx.Attr(attr.first); - } - functor(place, X, Out, ddX, ddOut, dOut, dX); - } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2000,57 +1351,7 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) 
= ddx * x.sign(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct ReluGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* Out, const framework::Tensor* ddX, - framework::Tensor* ddOut, framework::Tensor* dOut, - framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad")); - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad")); - ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template -struct LeakyReluGradGradFunctor : public BaseActivationFunctor { - float alpha; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* Out, const framework::Tensor* ddX, - framework::Tensor* ddOut, framework::Tensor* dOut, - framework::Tensor* dX) const { - if (ddOut) { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad")); - auto x = framework::EigenVector::Flatten( - GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad")); - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad")); - ddout.device(*d) = - ddx * - ((x > static_cast(0)).template cast() + - static_cast(alpha) * (x <= static_cast(0)).template cast()) - .template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2088,7 
+1389,7 @@ struct ELUGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2127,7 +1428,7 @@ struct CELUGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2156,7 +1457,9 @@ struct SqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(0.5) / out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2185,7 +1488,9 @@ struct RsqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(-0.5) * out * out * out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2214,7 +1519,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(2) * x; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // TODO(dengkaipeng): double gradient calculation for Square/Sqrt need @@ -2840,7 +2145,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; } // namespace operators @@ -2849,26 +2154,14 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(atan, Atan, 
AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ - __macro(cos, Cos, CosFunctor, CosGradFunctor); \ - __macro(tan, Tan, TanFunctor, TanGradFunctor); \ - __macro(acos, Acos, AcosFunctor, AcosGradFunctor); \ - __macro(sin, Sin, SinFunctor, SinGradFunctor); \ - __macro(asin, Asin, AsinFunctor, AsinGradFunctor); \ - __macro(sinh, Sinh, SinhFunctor, SinhGradFunctor); \ - __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \ - __macro(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); \ - __macro(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); \ - __macro(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ __macro(log10, Log10, Log10Functor, Log10GradFunctor); \ - __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ @@ -2879,7 +2172,5 @@ struct LogGradGradFunctor : public BaseActivationFunctor { __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ HardSigmoidGradFunctor); \ __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ - __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \ - ThresholdedReluGradFunctor); \ __macro(mish, Mish, MishFunctor, MishGradFunctor); \ __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index e1afb3919f813b756e228e37413166ad3f95d6df..256f20db08445e8b8d5933aa0e3151f69fcb5b10 100644 --- 
a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -18,60 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct CudaReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // relu(x) = max(x, 0) - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? x : zero; - } -}; - -template -struct CudaReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // dx = dout * (out > 0) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return out > zero ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template -struct CudaLeakyReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // leakyrelu(x) = x > 0 ? x : alpha * x - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? x : static_cast(alpha) * x; - } -}; - -template -struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float alpha; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha}}; - } - - // dx = dout * (x > 0 ? 1 : alpha) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > zero ? 
dout : static_cast(alpha) * dout; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - template struct CudaSigmoidFunctor : public BaseActivationFunctor { using MPType = typename details::MPTypeTrait::Type; @@ -93,7 +39,9 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { return dout * out * (one - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -122,7 +70,7 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp * (one + x * (one - temp)))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -159,30 +107,7 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // atan(x) = atan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atan(x)); - } -}; - -template -struct CudaAtanGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout / (1 + x^2) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return dout / (one + x * x); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -219,7 +144,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { return (x >= -l && x <= l) ? 
zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -262,295 +187,11 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { return static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } -}; - -template -struct CudaCosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cos(x) = cos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cos(x)); - } -}; - -template -struct CudaCosGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * (-sin(x)) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout * sin(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sin(x) = sin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sin(x)); - } -}; - -template -struct CudaSinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cos(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cos(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaTanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tan(x) = tan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - 
return static_cast(tan(x)); - } -}; - -template -struct CudaTanGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout / cos(x)^2 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / (cos(x) * cos(x))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // asin(x) = asin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asin(x)); - } -}; - -template -struct CudaAsinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / sqrt(one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAcosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // acos(x) = acos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acos(x)); - } -}; - -template -struct CudaAcosGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = -dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout / sqrt(one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template 
-struct CudaCoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cosh(x) = cosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cosh(x)); - } -}; - -template -struct CudaCoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * sinh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * sinh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sinh(x) = sinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sinh(x)); - } -}; - -template -struct CudaSinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cosh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cosh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaTanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tanh(x) = tanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tanh(x)); - } -}; - -template -struct CudaTanhGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout * (1 - out^2) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return dout * (one - out * out); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return 
kDepOut; } -}; - -template -struct CudaAcoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Acosh(x) = acosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acosh(x)); - } -}; - -template -struct CudaAcoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1 / sqrt(x^2 - 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x - one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Asinh(x) = asinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asinh(x)); - } -}; - -template -struct CudaAsinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * 1/sqrt(x^2 + 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x + one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Atanh(x) = atanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atanh(x)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; } }; -template -struct CudaAtanhGradFunctor : public BaseActivationFunctor { - 
using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1/(1- x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / (one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - template struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); @@ -566,7 +207,9 @@ struct CudaReciprocalGradFunctor : public BaseActivationFunctor { return -dout * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -587,7 +230,9 @@ struct CudaExpGradFunctor : public BaseActivationFunctor { return dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -608,7 +253,9 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor { return dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -629,7 +276,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor { return dout / x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -647,7 +294,7 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor { return dout * two * x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -670,7 +317,9 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor { return one_half * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + 
static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -693,7 +342,9 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor { return minus_one_half * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -717,7 +368,7 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor { return dout / (one + x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -741,7 +392,7 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor { return dout / (x * log_two); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -765,46 +416,7 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { return dout / (x * log_ten); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaBReluFunctor : public BaseActivationFunctor { - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // brelu(x) = min(max(x, t_min), t_max) - __device__ __forceinline__ T operator()(const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - T temp_max = x > t_min_cast ? x : t_min_cast; - T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; - return temp_min; - } -}; - -template -struct CudaBReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float t_min; - float t_max; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"t_min", &t_min}, {"t_max", &t_max}}; - } - - // dx = (x > t_min && x < t_max) ? 
dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - T t_min_cast = static_cast(t_min); - T t_max_cast = static_cast(t_max); - return (x > t_min_cast && x < t_max_cast) ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -849,7 +461,9 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor { : static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -893,7 +507,7 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor { return static_cast(dout * a * b * (one - temp * temp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -939,7 +553,7 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor { return x_beta > t ? arg_dout : static_cast(dout / (one + exp(-x_beta))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -962,7 +576,7 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor { return dout / (temp * temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -996,7 +610,9 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { return (out > zero && out < t) ? 
dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1022,7 +638,7 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { return static_cast(dout * tanh(x) * tanh(x)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1056,7 +672,7 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { return (x > -t && x < t) ? zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1097,7 +713,9 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { return (out > zero && out < one) ? dout * static_cast(slope) : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1141,7 +759,7 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 + temp3)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1190,39 +808,7 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (tsp + x * (one - tsp * tsp) * gsp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaThresholdedReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // thresholded_relu(x) = x > threshold ? x : 0 - __device__ __forceinline__ T operator()(const T x) const { - return x > static_cast(threshold) ? 
x : zero; - } -}; - -template -struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - float threshold; - - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"threshold", &threshold}}; - } - - // dx = x > threshold ? dout : 0 - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return x > static_cast(threshold) ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1274,7 +860,7 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { return dout * (temp1 * temp2 * (two * x + o) / s + one - temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1320,7 +906,9 @@ struct CudaELUGradFunctor : public BaseActivationFunctor { return static_cast(dout * (out_pos + out_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1347,7 +935,7 @@ struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { return static_cast(dout * (x_pos + x_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1429,7 +1017,7 @@ struct CudaCELUGradFunctor : public BaseActivationFunctor { temp_a_neg * temp_x_pos + exp(x / a) * temp_a_neg * temp_x_neg)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1477,13 +1065,14 @@ class ActivationGradCudaKernel std::vector ins = {d_out}; std::vector outs = {d_x}; - if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + if 
(static_cast(Functor::FwdDeps()) == + static_cast(ActBwdOpFwdDeps::kDepOut)) { // Only need forward output Out ins.push_back(out); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, &outs, functor); } else if (static_cast(Functor::FwdDeps()) == - static_cast(kDepX)) { + static_cast(ActBwdOpFwdDeps::kDepX)) { // Only need forward input X ins.push_back(x); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, @@ -1495,6 +1084,22 @@ class ActivationGradCudaKernel } }; +USE_PHI_FUNCTOR(CudaCos) +USE_PHI_FUNCTOR(CudaTan) +USE_PHI_FUNCTOR(CudaAcos) +USE_PHI_FUNCTOR(CudaSin) +USE_PHI_FUNCTOR(CudaAsin) +USE_PHI_FUNCTOR(CudaAtan) +USE_PHI_FUNCTOR(CudaSinh) +USE_PHI_FUNCTOR(CudaCosh) +USE_PHI_FUNCTOR(CudaAsinh) +USE_PHI_FUNCTOR(CudaAcosh) +USE_PHI_FUNCTOR(CudaAtanh) +USE_PHI_FUNCTOR(CudaTanh) +USE_PHI_FUNCTOR(CudaBRelu) +USE_PHI_FUNCTOR(CudaLeakyRelu) +USE_PHI_FUNCTOR(CudaThresholdedRelu) + } // namespace operators } // namespace paddle @@ -1509,7 +1114,9 @@ namespace plat = paddle::platform; ops::ActivationCudaKernel>, \ ops::ActivationCudaKernel>); \ + ops::functor>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ act_type##_grad, \ ops::ActivationGradCudaKernel>, \ ops::ActivationGradCudaKernel>); + ops::grad_functor>, \ + ops::ActivationGradCudaKernel>); #define REGISTER_ACTIVATION_CUDA_KERNEL_INT(act_type, op_name, functor, \ grad_functor) \ @@ -1531,7 +1140,9 @@ namespace plat = paddle::platform; ops::ActivationCudaKernel>, \ ops::ActivationCudaKernel>); \ + ops::functor>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ act_type##_grad, \ ops::ActivationGradCudaKernel>, \ ops::ActivationGradCudaKernel>); - -/* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, - CudaLeakyReluGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - leaky_relu_grad_grad, - ops::ActivationDoubleGradKernel>, - 
ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel< - plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor>); -/* ========================================================================== */ + ops::grad_functor>, \ + ops::ActivationGradCudaKernel>); /* ======================== elu register ============================ */ REGISTER_OP_CUDA_KERNEL( @@ -1594,50 +1193,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== relu register ============================ */ -#ifdef PADDLE_WITH_HIP -REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor, - CudaReluGradFunctor); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#else -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#endif -/* ========================================================================== */ - /* =========================== sigmoid register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, @@ -1650,7 +1205,9 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidDoubleGradKernel>, ops::SigmoidDoubleGradKernel>); + ops::SigmoidGradGradFunctor>, + ops::SigmoidDoubleGradKernel>); REGISTER_OP_CUDA_KERNEL( sigmoid_triple_grad, @@ -1659,30 +1216,10 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidTripleGradKernel>, ops::SigmoidTripleGradKernel>); -/* 
========================================================================== */ - -/* =========================== tanh register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor, - CudaTanhGradFunctor); - -REGISTER_OP_CUDA_KERNEL( - tanh_grad_grad, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>, - ops::TanhDoubleGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - tanh_triple_grad, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>, - ops::TanhTripeGradKernel>); + ops::SigmoidTripleGradFunctor>, + ops::SigmoidTripleGradKernel< + plat::CUDADeviceContext, + ops::SigmoidTripleGradFunctor>); /* ========================================================================== */ /* =========================== sqrt register ============================= */ @@ -1696,7 +1233,9 @@ REGISTER_OP_CUDA_KERNEL( ops::SqrtDoubleGradKernel>, ops::SqrtDoubleGradKernel>); + ops::SqrtGradGradFunctor>, + ops::SqrtDoubleGradKernel>); /* ========================================================================== */ /* =========================== rsqrt register ============================= @@ -1726,6 +1265,8 @@ REGISTER_OP_CUDA_KERNEL( ops::SquareGradGradFunctor>, ops::SquareDoubleGradKernel>, + ops::SquareDoubleGradKernel>, ops::SquareDoubleGradKernel>, ops::SquareDoubleGradKernel { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, - PT_INFER_META(phi::AddmmInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, + PD_INFER_META(phi::AddmmInferMeta)); REGISTER_OPERATOR(addmm, ops::AddMMOp, ops::AddMMOpMaker, ops::AddMMOpGradMaker, ops::AddMMOpGradMaker, diff --git a/paddle/fluid/operators/allclose_op.cc b/paddle/fluid/operators/allclose_op.cc index 8fb9929c39e9223303f4427f1a0d7e1ed66134d4..88d7cb7c1f5f4bf47dc82f8632116424253d6d19 100644 --- a/paddle/fluid/operators/allclose_op.cc +++ b/paddle/fluid/operators/allclose_op.cc @@ -12,52 +12,20 @@ // See the License for 
the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/allclose_op.h" #include #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -template -struct GetTensorValue { - T operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - return *(tensor.data()); - } -}; - -template -struct AllcloseFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - auto* in_a = in.data(); - auto* in_b = other.data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto num = in.numel(); - *out_data = true; - for (int i = 0; i < num; i++) { - const T a = in_a[i], b = in_b[i]; - bool val; - if (std::isnan(a) || std::isnan(b)) { - val = equal_nan && std::isnan(a) == std::isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? 
left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - *out_data &= val; - } - } -}; - class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -96,40 +64,6 @@ class AllcloseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "Allclose"); - OP_INOUT_CHECK(ctx->HasInput("Other"), "Input", "Other", "Allclose"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Allclose"); - - auto input_dim = ctx->GetInputDim("Input"); - auto other_dim = ctx->GetInputDim("Other"); - PADDLE_ENFORCE_EQ(input_dim.size(), other_dim.size(), - platform::errors::PreconditionNotMet( - "Input(Input) and Input(Other) must have the same " - "dimension size.")); - int n = input_dim.size(); - bool is_runtime = ctx->IsRuntime(); - for (int i = 0; i < n; i++) { - if (is_runtime) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } else { - if (!(input_dim[i] < 0 || other_dim[i] < 0)) { - PADDLE_ENFORCE_EQ(input_dim[i], other_dim[i], - platform::errors::PreconditionNotMet( - "The value at dim %d of Input(Input) is not " - "equal to the Input(Other): %ld != %ld.", - i, input_dim[i], other_dim[i])); - } - } - } - - ctx->SetOutputDim("Out", phi::make_ddim({1})); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -152,13 +86,13 @@ class AllcloseOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(allclose, AllcloseInferShapeFunctor, + PD_INFER_META(phi::AllValueCompareInferMeta)); 
REGISTER_OPERATOR( allclose, ops::AllcloseOp, ops::AllcloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - ops::AllcloseOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(allclose, ops::AllcloseKernel, - ops::AllcloseKernel); + ops::AllcloseOpVarTypeInference, AllcloseInferShapeFunctor); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(allclose) diff --git a/paddle/fluid/operators/allclose_op.cu b/paddle/fluid/operators/allclose_op.cu deleted file mode 100644 index 32c90ff8fdc109b30b140f0f70b336615ce93c17..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/allclose_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/allclose_op.h" - -namespace paddle { -namespace operators { - -template -struct GetTensorValue { - T operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - const T* data = tensor.data(); - T value; - const auto gpu_place = dev_ctx.GetPlace(); - memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), - dev_ctx.stream()); - return value; - } -}; - -template -__global__ void AllcloseCUDAKernel(const T* in_data, const T* other_data, - const double rtol, const double atol, - bool equal_nan, int num, bool* out_data) { - unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; - bool val; - for (int i = idx; i < num; i += blockDim.x * gridDim.x) { - const T a = in_data[i], b = other_data[i]; - if (isnan(a) || isnan(b)) { - val = equal_nan && isnan(a) == isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - if (!val) *out_data = false; - } -} - -template -struct AllcloseFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - int num = in.numel(); - const T* in_data = in.data(); - const T* other_data = other.data(); - bool* out_data = output->mutable_data(dev_ctx.GetPlace()); - int block = 1024; - int grid = (block - 1 + num) / block; - grid = (grid > block) ? 
block : grid; -#ifdef PADDLE_WITH_HIP - hipMemset(out_data, true, sizeof(bool)); -#else - cudaMemset(out_data, true, sizeof(bool)); -#endif - AllcloseCUDAKernel<<>>( - in_data, other_data, rtol, atol, equal_nan, num, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(allclose, ops::AllcloseKernel, - ops::AllcloseKernel); diff --git a/paddle/fluid/operators/allclose_op.h b/paddle/fluid/operators/allclose_op.h deleted file mode 100644 index 7a36754194ace5fad14d5a77e9d0be7f1c182087..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/allclose_op.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -struct GetTensorValue { - T operator()(const platform::DeviceContext& ctx, - const framework::Tensor& tensor) const; -}; - -template -struct AllcloseFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& other, const float rtol, - const float atol, bool equal_nan, framework::Tensor* output); -}; - -template -class AllcloseKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // get attrs - bool equal_nan = ctx.Attr("equal_nan"); - // get input/output - const auto* input = ctx.Input("Input"); - const auto* other = ctx.Input("Other"); - auto* out = ctx.Output("Out"); - - double rtol_v = std::stod(ctx.Attr("rtol")); - double atol_v = std::stod(ctx.Attr("atol")); - - auto& dev_ctx = ctx.template device_context(); - GetTensorValue get_tensor_value; - if (ctx.HasInput("Rtol")) { - const auto* rtol = ctx.Input("Rtol"); - PADDLE_ENFORCE_EQ( - rtol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Rtol) size must be 1, but get %d.", rtol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rtol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Rtol) type must be double, but get %s.", - framework::DataTypeToString( - framework::TransToProtoVarType(rtol->dtype())))); - rtol_v = get_tensor_value(dev_ctx, *rtol); - } - if (ctx.HasInput("Atol")) { - const auto* atol = ctx.Input("Atol"); - PADDLE_ENFORCE_EQ( - atol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Atol) size must be 1, but get %d", atol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(atol->dtype()), - 
framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Atol) type must be double, but get %s", - framework::DataTypeToString( - framework::TransToProtoVarType(atol->dtype())))); - atol_v = get_tensor_value(dev_ctx, *atol); - } - - AllcloseFunctor()(dev_ctx, *input, *other, rtol_v, atol_v, - equal_nan, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..237cfcc6f1172518097863158ca6dbd595af4186 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_mlu.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CheckFiniteAndUnscaleMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto& dev_ctx = ctx.template device_context(); + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + found_inf->mutable_data(dev_ctx.GetPlace()); + + MLUCnnlTensorDesc scale_desc(*scale); + MLUCnnlTensorDesc found_inf_desc(*found_inf, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(ctx.GetPlace()); + + // check is_finite or is_nan + Tensor is_finite(found_inf->type()); + if (i != 0) { + is_finite.Resize(phi::make_ddim({1})); + is_finite.mutable_data(ctx.GetPlace()); + } else { + is_finite.ShareDataWith(*found_inf); + } + + MLUCnnlTensorDesc x_desc(*x); + + MLUCnnl::IsNanInf(ctx, x_desc.get(), GetBasePtr(x), + GetBasePtr(&is_finite)); + + // save is_finite by logical_and op after checking every input + if (i != 0) { + MLUCnnlTensorDesc is_finite_desc(is_finite, CNNL_LAYOUT_ARRAY, + ToCnnlDataType()); + MLUCnnl::Logic(ctx, CNNL_LOGIC_OP_OR, found_inf_desc.get(), + GetBasePtr(found_inf), is_finite_desc.get(), + GetBasePtr(&is_finite), found_inf_desc.get(), + GetBasePtr(found_inf)); + } + + // The normal logic is : + // out = in, if found_inf = true + // out = in/scale, if found_inf = false + // But when found_inf is true, the data of Out should not be used. + // So, on MLU, we always compute out with in/scale. 
+ MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Div(ctx, CNNL_COMPUTATION_HIGH_PRECISION, x_desc.get(), + GetBasePtr(x), scale_desc.get(), GetBasePtr(scale), + out_desc.get(), GetBasePtr(out)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_MLU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleMLUKernel, + ops::CheckFiniteAndUnscaleMLUKernel); diff --git a/paddle/fluid/operators/amp/fp16_type_traits.h b/paddle/fluid/operators/amp/fp16_type_traits.h index f7aa0de97598df67817d81c1d1c1a5e8356f42ea..56aebe90788fbaa6c300ee9ac620c3d7613ff141 100644 --- a/paddle/fluid/operators/amp/fp16_type_traits.h +++ b/paddle/fluid/operators/amp/fp16_type_traits.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -32,6 +33,12 @@ class MPTypeTrait { using Type = float; }; +template <> +class MPTypeTrait { + public: + using Type = float; +}; + } // namespace details } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 0f5c048b6be9c73ae98181685269592f409196cd..c5e4188ca2d6f749a06127c41da99490a7fb3ffc 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -15,23 +15,19 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_max, ArgMaxInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); + REGISTER_OPERATOR( arg_max, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMaxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - arg_max, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel); + paddle::framework::EmptyGradOpMaker, + ArgMaxInferShapeFunctor); + REGISTER_OP_VERSION(arg_max) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h deleted file mode 100644 index b77031f7fb4c9d94f30ed06333b9c8766fd2310d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/arg_min_max_op_base.cu.h +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#if defined(__NVCC__) || defined(__HIPCC__) - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include -#include -#include -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -namespace { // NOLINT -template -using KeyValuePair = cub::KeyValuePair; -using Tensor = framework::Tensor; - -} // end namespace - -#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ - case (1 << (log2_block_dim)): { \ - constexpr auto kBlockDim = (1 << (log2_block_dim)); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM_CASE(...) \ - FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); - -template -__global__ void ArgCUDAKernel(const int64_t height, // n * h - const int64_t width, // c - const int64_t post_size, // h - const Reducer reducer, const T init, const T* in, - IndType* out) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int idx = blockIdx.x; idx < height; idx += gridDim.x) { - KeyValuePair kv_pair = {-1, init}; - int h = idx / post_size; - int w = idx % post_size; - for (int k = threadIdx.x; k < width; k += blockDim.x) { - kv_pair = - reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair); - } - kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); - if (threadIdx.x == 0) { - out[idx] = static_cast(kv_pair.key); - } - __syncthreads(); - } -} - -template -void 
ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, - Tensor* indices, const int64_t pre, const int64_t post, - const int64_t n) { - auto cu_stream = ctx.stream(); - auto ComputeBlockSize = [](int64_t col) { - auto block_size = 8; - if (col > 512) - block_size = 1024; - else if (col > 256) - block_size = 512; - else if (col > 128) - block_size = 256; - else if (col > 64) - block_size = 128; - else if (col > 32) - block_size = 64; - else if (col > 16) - block_size = 32; - else if (col > 8) - block_size = 16; -#ifdef __HIPCC__ - block_size = std::min(block_size, 256); -#endif - return block_size; - }; - - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; - int64_t height = pre * post; - int64_t width = n; - int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx; - - const T* in_data = input.data(); - IndType* out_data = indices->mutable_data(ctx.GetPlace()); - - if (typeid(Reducer) == typeid(cub::ArgMax)) { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::lowest(), - in_data, out_data)); - } - } else { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::max(), - in_data, out_data)); - } - } -} - -template -struct VisitDataCudaArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - const bool& flatten = ctx.Attr("flatten"); - - framework::DDim input_dims; - if (flatten) { - input_dims = phi::make_ddim({input->numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - input_dims = input->dims(); - if (axis < 0) axis += input->dims().size(); - } - - int64_t numel = input->numel(); - int64_t groups = numel / 
input_dims[axis]; - int64_t pre = 1; - int64_t post = 1; - int64_t n = input_dims[axis]; - - for (int i = 0; i < axis; i++) { - pre *= input_dims[i]; - } - - for (int i = axis + 1; i < input_dims.size(); i++) { - post *= input_dims[i]; - } - - const auto& dev_ctx = ctx.cuda_device_context(); - ComputeFullArg(dev_ctx, *input, output, pre, post, n); - } -}; -template -class ArgMinMaxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataCudaArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataCudaArgMinMaxFunctor(ctx)); - } -}; - -#endif - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index d3ce61d183a3d322e40966ce59f9a10320ceab4f..585341beea12c14fbd01a3a47af34ce57def0db5 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -27,193 +27,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -enum ArgMinMaxType { kArgMin, kArgMax }; - -template -struct ArgMinMaxFunctor {}; - -#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ - template \ - struct ArgMinMaxFunctor { \ - void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ - framework::LoDTensor* out, framework::DDim x_dims, \ - int64_t axis, bool keepdims) { \ - auto in_eigen = framework::EigenTensor::From(in, x_dims); \ - if (keepdims) { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } else { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } \ - } \ - } - -DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); -DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); - -template -struct VisitDataArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto& x = *(ctx.Input("X")); - auto& out = *(ctx.Output("Out")); - out.template mutable_data(ctx.GetPlace()); - auto axis = ctx.Attr("axis"); - auto keepdims = ctx.Attr("keepdims"); - const bool& flatten = ctx.Attr("flatten"); - // paddle do not have the scalar tensor, just return the shape [1] tensor - if (flatten) keepdims = true; - - // if flatten, will construct the new dims for the cacluate - framework::DDim x_dims; - if (flatten) { - x_dims = phi::make_ddim({x.numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - x_dims = x.dims(); - if (axis < 0) axis += x_dims.size(); - } - auto& dev_ctx = ctx.template device_context(); - -#define CALL_ARG_MINMAX_FUNCTOR(rank) \ - ArgMinMaxFunctor \ - functor##rank; \ - functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims) - - switch 
(x_dims.size()) { - case 1: - CALL_ARG_MINMAX_FUNCTOR(1); - break; - case 2: - CALL_ARG_MINMAX_FUNCTOR(2); - break; - case 3: - CALL_ARG_MINMAX_FUNCTOR(3); - break; - case 4: - CALL_ARG_MINMAX_FUNCTOR(4); - break; - case 5: - CALL_ARG_MINMAX_FUNCTOR(5); - break; - case 6: - CALL_ARG_MINMAX_FUNCTOR(6); - break; - default: - PADDLE_ENFORCE_LE( - x_dims.size(), 6, - platform::errors::InvalidArgument( - "%s operator doesn't supports tensors whose ranks are greater " - "than 6.", - (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax"))); - break; -#undef CALL_ARG_MINMAX_FUNCTOR - } - } -}; - -template -class ArgMinMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataArgMinMaxFunctor(ctx)); - } -}; - -template -using ArgMinKernel = ArgMinMaxKernel; - -template -using ArgMaxKernel = ArgMinMaxKernel; - class ArgMinMaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "arg_min_max"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "arg_min_max"); - const auto& x_dims = ctx->GetInputDim("X"); - int64_t axis = ctx->Attrs().Get("axis"); - bool keepdims = ctx->Attrs().Get("keepdims"); - const bool& flatten = ctx->Attrs().Get("flatten"); - - PADDLE_ENFORCE_GE(axis, -x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -Rank(X)(%d).", - axis, -x_dims.size())); - PADDLE_ENFORCE_LT( - axis, x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", axis, - x_dims.size())); - - 
const int& dtype = ctx->Attrs().Get("dtype"); - PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 2 || dtype == 3), true, - platform::errors::InvalidArgument( - "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " - "received [%s]", - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64), - paddle::framework::DataTypeToString( - static_cast(dtype)))); - - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; - if (ctx->IsRuntime()) { - if (dtype == framework::proto::VarType::INT32) { - int64_t all_element_num = 0; - if (flatten) { - all_element_num = phi::product(x_dims); - - } else { - all_element_num = x_dims[axis]; - } - PADDLE_ENFORCE_LE( - all_element_num, INT_MAX, - platform::errors::InvalidArgument( - "The element num of the argmin/argmax input at axis is " - "%d, is larger than int32 maximum value:%d, you must " - "set the dtype of argmin/argmax to 'int64'.", - all_element_num, INT_MAX)); - } - } - std::vector vec; - if (flatten) { - vec.emplace_back(static_cast(1)); - } else { - for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); - if (keepdims) { - vec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); - } - ctx->SetOutputDim("Out", phi::make_ddim(vec)); - } }; class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 0a4ba6fb0bfdfccfc4eae99da730e96fe5f0a540..fb3abd01af8c396d764f9f1d247f24c41bd15959 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -12,26 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_min, ArgMinInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); REGISTER_OPERATOR( arg_min, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMinOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ArgMinInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - arg_min, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel); REGISTER_OP_VERSION(arg_min) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu deleted file mode 100644 index 23170bf0087906d752767051ce58874cb3584ee5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/arg_min_op.cu +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/arg_min_max_op_base.cu.h" -REGISTER_OP_CUDA_KERNEL( - arg_min, paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel, - paddle::operators::ArgMinMaxOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc index 9e525c20335d37242d0e239e81d2d2976b92a6b4..1a8aca777370bc140e39b7457702557042541744 100644 --- a/paddle/fluid/operators/argsort_op.cc +++ b/paddle/fluid/operators/argsort_op.cc @@ -12,40 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/argsort_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { class ArgsortOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "argsort"); - - auto in_dims = ctx->GetInputDim("X"); - int axis = ctx->Attrs().Get("axis"); - - auto num_dims = in_dims.size(); - PADDLE_ENFORCE_GE(axis, -num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -num_dims(%d).", - axis, -num_dims)); - PADDLE_ENFORCE_LT( - axis, num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be less than num_dims(%d).", axis, num_dims)); - - ctx->ShareDim("X", "Out"); - ctx->ShareDim("X", "Indices"); - ctx->ShareLoD("X", "Out"); - 
ctx->ShareLoD("X", "Indices"); - } }; class ArgsortGradOp : public framework::OperatorWithKernel { @@ -122,18 +101,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ArgsortGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(argsort, ArgsortInferShapeFunctor, + PD_INFER_META(phi::ArgsortInferMeta)); REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker, ops::ArgsortGradOpMaker, - ops::ArgsortGradOpMaker); + ops::ArgsortGradOpMaker, + ArgsortInferShapeFunctor); REGISTER_OPERATOR(argsort_grad, ops::ArgsortGradOp, ops::ArgsortGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(argsort, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel); -REGISTER_OP_CPU_KERNEL( - argsort_grad, ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel); diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu deleted file mode 100644 index 8b7a0b3eadb16bbe0822809748e343dc0d793a0f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/argsort_op.cu +++ /dev/null @@ -1,430 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/argsort_op.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -#ifdef __HIPCC__ -namespace rocprim { -namespace detail { -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; -} // namespace detail -} // namespace rocprim -#else -// set cub base traits in order to handle float16 -namespace cub { -template <> -struct NumericTraits - : BaseTraits {}; -} // namespace cub -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// Iter for move to next row -struct SegmentOffsetIter { - EIGEN_DEVICE_FUNC - explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { - return idx * num_cols_; - } - - int num_cols_; -}; - -template -static __global__ void FillIndex(T* indices, T num_rows, T num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (T j = row_id; j < num_rows; j += gridDim.x) { - for (T i = col_id; i < num_cols; i += blockDim.x) { - indices[j * num_cols + i] = i; - } - } -} - -template -static __global__ void FillFlattenGrad(const T* dO, const IndType* indices, - int64_t size, T* dX) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < size; i += stride) { - dX[indices[i]] = dO[i]; - } -} - -template -static __global__ void FillGrad(const T* dO, const IndType* indices, T* dX, - IndType num_rows, IndType num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (IndType j = row_id; j < num_rows; j += gridDim.x) { - for (IndType i = col_id; i < num_cols; i += 
blockDim.x) { - dX[j * num_cols + indices[j * num_cols + i]] = dO[j * num_cols + i]; - } - } -} - -// Sort by flag descending, True: descending. False: Ascending. -// Default is false. -template -void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, - Tensor* output, Tensor* indices, const IndType num_rows, - const IndType num_cols, const bool descending) { - auto cu_stream = ctx.stream(); - - Tensor input_indices; - - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - - size_t temp_storage_bytes = -1; - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX; - // Init a index array - FillIndex<<>>( - input_indices.data(), num_rows, num_cols); - - T* sorted_out_ptr; - IndType* sorted_indices_ptr; - - const T* inp = input->data(); - T* out = output->mutable_data(ctx.GetPlace()); - IndType* ind = indices->mutable_data(ctx.GetPlace()); - - sorted_out_ptr = out; - sorted_indices_ptr = ind; - - // create iter for counting input - cub::CountingInputIterator counting_iter(0); - // segment_offset is used for move to next row - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - - gpuError_t err; - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - PADDLE_ENFORCE_GPU_SUCCESS(err); - - Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - - PADDLE_ENFORCE_GPU_SUCCESS(err); -} - -template -void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const 
Tensor* indices, Tensor* dX, const IndType num_rows, - const IndType num_cols) { - auto cu_stream = ctx.stream(); - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX; - FillGrad<<>>( - dO->data(), indices->data(), dX->data(), num_rows, - num_cols); -} - -template -void ArgFlattenAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, int64_t size, Tensor* dX) { - auto cu_stream = ctx.stream(); - - const int64_t block_size = - std::min(size, static_cast(ctx.GetMaxThreadsPerBlock())); - int64_t max_threads = ctx.GetMaxPhysicalThreadCount(); - const int64_t max_blocks = - std::max(((max_threads - 1) / block_size + 1), static_cast(1)); - const int64_t grid_size = - std::min(max_blocks, (size + block_size - 1) / block_size); - - FillFlattenGrad<<>>( - dO->data(), indices->data(), size, dX->data()); -} - -template -class ArgsortOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - const T* in_data = input->data(); - auto size = input->numel(); - T* out_data = output->mutable_data(ctx.GetPlace()); - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - - // Use thrust for parallel acceleration when the input size is equal to the - // length of the ‘axis’ dimension. 
- // Compared to the following 'Special case for full sort', ascending sort is - // 34 times faster and descending sort is 31 times faster. - if (size == in_dims[axis]) { - thrust::sequence(thrust::device, ids_data, ids_data + size); - thrust::copy(thrust::device, in_data, in_data + size, out_data); - thrust::sort_by_key(thrust::device, out_data, out_data + size, ids_data); - if (descending) { - thrust::reverse(thrust::device, out_data, out_data + size); - thrust::reverse(thrust::device, ids_data, ids_data + size); - } - return; - } - - // Special case for full sort, speedup ~190x. - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - ArgFullSort(dev_ctx, input, output, indices, input_height, - input_width, descending); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - T* trans_inp_data = trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - T* out_data = output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - // temp indices for sorting - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - 
indices->mutable_data(ctx.GetPlace()); - - ArgFullSort(dev_ctx, &trans_inp, &tmp_out, &tmp_indices, - input_height, input_width, descending); - - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - return; - } - } -}; - -template -class ArgsortGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - dX->mutable_data(ctx.GetPlace()); - if (dO->numel() == 0) return; - - auto in_dims = dX->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - int64_t size = dX->numel(); - const auto& dev_ctx = ctx.cuda_device_context(); - - // Parallel acceleration when the input size is equal to the length of the - // ‘axis’ dimension. - // Compared to 'special case for full sort' below, the gradient calculation - // is 10 times faster. - if (size == in_dims[axis]) { - ArgFlattenAssign(dev_ctx, dO, indices, size, dX); - return; - } - - // Special case for full sort, speedup ~190x. 
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - ArgFullAssign(dev_ctx, dO, indices, dX, input_height, - input_width); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - ArgFullAssign(dev_ctx, &trans_dO, &trans_ind, &tmp_out, - input_height, input_width); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - return; - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - argsort, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - argsort_grad, paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel); 
diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h deleted file mode 100644 index d850e51a4bf061d3e5fc46bd53a2ef56610d6de9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/argsort_op.h +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -using Tensor = framework::Tensor; - -template -static void FullSort(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices, - bool descending) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [&](const std::pair& l, 
const std::pair& r) { - if (descending) - return l.first > r.first; - else - return l.first < r.first; - }); - - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + j] = col_vec[j].first; - t_indices[i * input_width + j] = col_vec[j].second; - } - } -} - -template -static void FullAssign(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, - const framework::Tensor* indices, T* t_out) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - auto e_indices = EigenVector::Flatten(*indices); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(j)] = e_input(j); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(i, j)] = e_input(i, j); - } - } - } -} - -template -class ArgsortKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - T* out_data = output->mutable_data(ctx.GetPlace()); - - // Do full sort - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - FullSort(input_height, input_width, in_dims.size(), input, - out_data, ids_data, descending); - } else { - // If not full sort do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - - auto* t_ind = - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - - FullSort(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, descending); - - indices->mutable_data(ctx.GetPlace()); - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - } - } -}; - -template -class ArgsortGradientKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = 
ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - auto in_dims = indices->dims(); - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto& place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - // Do full assign - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - FullAssign(input_height, input_width, in_dims.size(), dO, - indices, dX->data()); - } else { - // If not full assign do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - FullAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - } - } -}; - -} // namespace operators -} // 
namespace paddle diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc index 077be715bece0b4119dc0a578a1cba4631eb45f2..c927eec00bc8bf9e84ad1fb53a907ff8ec71acbc 100644 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/argsort_op_xpu.cc b/paddle/fluid/operators/argsort_op_xpu.cc index 18e81936a16c63a1d2693dfb47dc618c3e707ae0..359b00fcf87ee1bee27e668ae3973fa39be19d76 100644 --- a/paddle/fluid/operators/argsort_op_xpu.cc +++ b/paddle/fluid/operators/argsort_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index 72488a932d9c33cbfeddc9f35818e42ebe0137fa..b452dea8536dd98d6d4060d5224e39daf9137c50 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -23,7 +23,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/atan2_op.cc b/paddle/fluid/operators/atan2_op.cc index 71a895c244c54f62c0af1745635c08fea35436c4..0783b30a8580db403255211d879d9400a1e82ab7 100644 --- a/paddle/fluid/operators/atan2_op.cc +++ b/paddle/fluid/operators/atan2_op.cc @@ -105,8 +105,8 @@ class Atan2OpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, - PT_INFER_META(phi::Atan2InferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(atan2, Atan2InferShapeFunctor, + PD_INFER_META(phi::Atan2InferMeta)); REGISTER_OPERATOR(atan2, ops::Atan2Op, ops::Atan2OpMaker, ops::Atan2GradMaker, ops::Atan2GradMaker, diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index a23e484d0a88bb87febc6d320f9183ef50ea0ebc..78ea8b6b6fbebd7e0ca5ce14cc2cba6ff197177f 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -14,10 +14,10 @@ limitations under the License. */ #include "paddle/fluid/operators/attention_lstm_op.h" #include -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/cpu_vec.h" namespace paddle { namespace operators { @@ -269,10 +269,10 @@ use lstm_x_t as input and compute as standard LSTM. 
template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { if (bias) { - math::vec_add_bias(n, *bias, x, y); - math::vec_relu(n, y, y); + phi::funcs::vec_add_bias(n, *bias, x, y); + phi::funcs::vec_relu(n, y, y); } else { - math::vec_relu(n, x, y); + phi::funcs::vec_relu(n, x, y); } } @@ -283,14 +283,14 @@ inline void vec_softmax(const int n, const T* x, T* y) { for (int i = 1; i < n; ++i) { scalar = scalar < x[i] ? x[i] : scalar; } - math::vec_add_bias(n, -scalar, x, y); // sub - math::vec_exp(n, y, y); // exp + phi::funcs::vec_add_bias(n, -scalar, x, y); // sub + phi::funcs::vec_exp(n, y, y); // exp // sum scalar = T(0); for (int i = 0; i < n; ++i) { scalar += y[i]; } - math::vec_scal(n, static_cast(1) / scalar, y); // scale + phi::funcs::vec_scal(n, static_cast(1) / scalar, y); // scale } template @@ -344,12 +344,12 @@ class AttentionLSTMKernel : public framework::OpKernel { auto& act_cell_str = ctx.Attr("cell_activation"); auto& act_cand_str = ctx.Attr("candidate_activation"); if (platform::MayIUse(platform::avx)) { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); } else { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 949cf021cf0fa322970c210fa26f698fd2bc45b2..174207deb08b84194d6f20fe04e4c27245295caf 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -1289,15 +1289,3 @@ REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, ops::BatchNormDoubleGradMaker); REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp, ops::BatchNormDoubleGradOpInplaceInferer); - -REGISTER_OP_CPU_KERNEL( - batch_norm, 
ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CPU_KERNEL( - batch_norm_grad, - ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CPU_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel, - ops::BatchNormDoubleGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index d59396db1517faadaa2dd9e9af770d2e8a23ec56..a19b087245a89a4a12f062b1ce27835b98ecfd66 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,1327 +41,5 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; -template -static __global__ void BNForwardInference( - const T *x, const BatchNormParamType *mean, - const BatchNormParamType *variance, const BatchNormParamType *scale, - const BatchNormParamType *bias, const int C, const int N, const int HxW, - const double epsilon, T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int num = N * C * HxW; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? 
i / HxW % C : i % C; - BatchNormParamType x_sub_mean = - static_cast>(x[i]) - mean[c]; - BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); - y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); - } -} - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( - const T *x, const BatchNormParamType *scale, - const BatchNormParamType *bias, const int C, const int N, const int HxW, - const double epsilon, double exponentialAverageFactor, T *y, - BatchNormParamType *mean, BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - int outer_size = C; - int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType variance_val; - __shared__ BatchNormParamType inv_var_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = static_cast>(x[index]); - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - variance_val = x_square_sum / inner_size - mean_val * mean_val; - inv_var_val = 1 / sqrt(variance_val + epsilon); - - if (save_mean && save_inv_variance) { - save_mean[i] = mean_val; - save_inv_variance[i] = inv_var_val; - } - mean[i] = (1 - exponentialAverageFactor) * mean_val + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val + - exponentialAverageFactor * variance[i]; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_sub_mean = - static_cast>(x[index]) - mean_val; - y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; - } - } -} - -template -class BatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - bool test_mode = is_test && (!trainable_stats); - - // Get the size for each dimension. 
- // NCHW [batch_size, in_channels, in_height, in_width] - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ( - x_dims.size() >= 2 && x_dims.size() <= 5, true, - platform::errors::InvalidArgument( - "The size of input's dimensions should be between 2 and 5" - "But received: the size of input's dimensions is [%d]", - x_dims.size())); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - - int N, C, H, W, D; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC - : DataLayout::kNCHW; - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// HIP do not support compute format of NHWC -// auto compute_format = DataLayout::kNCHW; -#else - const bool fast_nhwc_batch_norm = - test_mode || - (dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent); - - auto compute_format = - fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC - ? 
DataLayout::kNHWC - : DataLayout::kNCHW; -#endif - - Tensor transformed_x(x->type()); - Tensor transformed_y(y->type()); - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst(ctx, x, - &transformed_x); - TransToChannelFirst(ctx, x, - &transformed_x); - ResizeToChannelFirst(ctx, y, - &transformed_y); - } else { - transformed_x.ShareDataWith(*x); - transformed_y.ShareDataWith(*y); - } - -// ------------------- cudnn descriptors --------------------- -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// miopenTensorDescriptor_t data_desc_; -// miopenTensorDescriptor_t bn_param_desc_; -// miopenBatchNormMode_t mode_; - -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); -#endif - - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. 
Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// mode_ = miopenBNSpatial; -#elif CUDNN_VERSION_MIN(7, 0, 1) - if (FLAGS_cudnn_batchnorm_spatial_persistent) { - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#else - if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#endif // CUDNN_VERSION_MIN(7, 0, 1) - - VLOG(3) << "Setting descriptors."; - std::vector dims; - std::vector strides; - if (compute_format == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * D * C, 1, W * D * C, D * C, C}; - } - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( -// data_desc_, CudnnDataType::type, -// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), -// const_cast(strides.data()))); -// Note: PERSISTENT not implemented for inference -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDeriveBNTensorDescriptor( -// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, - test_mode ? 
CUDNN_BATCHNORM_SPATIAL : mode_)); -#endif - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto &dev_ctx = ctx.template device_context(); - - auto handle = dev_ctx.cudnn_handle(); - - // Now, depending on whether we are running test or not, we have two paths. - // It is training mode when it's not reference AND not using pre-trained - // model. - bool training = !test_mode && !use_global_stats; - if (!training) { - // only when test we use input to do computation. - const auto *est_mean = ctx.Input("Mean"); - const auto *est_var = ctx.Input("Variance"); - // Run inference mode. - PADDLE_ENFORCE_EQ( - est_mean->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of mean's dimensions must equal to 1." - "But received: the size of mean's dimensions mean is [%d]," - "the dimensions of mean is [%s].", - est_mean->dims().size(), est_mean->dims())); - PADDLE_ENFORCE_EQ( - est_var->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of variance's dimensions must equal to 1." - "But received: the size of variance's dimensions is [%d]," - "the dimensions of variance is [%s].", - est_var->dims().size(), est_var->dims())); - PADDLE_ENFORCE_EQ( - est_mean->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of mean must equal to the number of " - "Channels, which is [%d]. But received: the first dimension" - "of mean is [%d], the dimensions of mean is [%s].", - C, est_mean->dims()[0], est_mean->dims())); - PADDLE_ENFORCE_EQ( - est_var->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of variance must equal to the number" - "of Channels, which is [%d]. 
But received: the first dimension of" - "variance is [%d], the dimensions of variance is [%s].", - C, est_var->dims()[0], est_var->dims())); - -#ifdef PADDLE_WITH_HIP - const int block_size = 256; - const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; - if (compute_format == DataLayout::kNCHW) { - BNForwardInference< - T, - DataLayout::kNCHW><<>>( - transformed_x.template data(), - est_mean->template data>(), - est_var->template data>(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, transformed_y.template data()); - } else { - BNForwardInference< - T, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - est_mean->template data>(), - est_var->template data>(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, transformed_y.template data()); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardInference( -// handle, miopenBNSpatial, -// const_cast( -// static_cast(CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// const_cast(static_cast( -// bias->template data>())), -// const_cast(static_cast( -// est_mean->template data>())), -// const_cast(static_cast( -// est_var->template data>())), -// epsilon)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardInference( - handle, - // Note: PERSISTENT not implemented for inference - CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_y.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - bias->template 
data>(), - est_mean->template data>(), - est_var->template data>(), epsilon)); -#endif - } else { - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; - paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), - &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - // Run training mode. - // obtain running mean and running inv var, and there is no need - // to initialize them. - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - mean_out->mutable_data>(ctx.GetPlace()); - variance_out->mutable_data>(ctx.GetPlace()); - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); - - if ((N * H * W * D) == 1) { - // Only 1 element in normalization dimension, - // skip the batch norm calculation, let y = x. - framework::TensorCopy(*x, ctx.GetPlace(), y); - } else { - double this_factor = 1. - momentum; - - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - size_t reserve_space_size = 0; - void *reserve_space_ptr = nullptr; - void *workspace_ptr = nullptr; - Tensor workspace_tensor; - // Create reserve space and workspace for batch norm. - // Create tensor for each batchnorm op, it will be used in the - // backward. Thus this tensor shouldn't be temp. 
- auto *reserve_space = ctx.Output("ReserveSpace"); - PADDLE_ENFORCE_NOT_NULL( - reserve_space, - platform::errors::NotFound( - "The argument ReserveSpace of batch_norm op is not found.")); - - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*zDesc=*/nullptr, - /*yDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*activationDesc=*/nullptr, - /*xDesc=*/data_desc_, - /*sizeInBytes=*/&reserve_space_size)); - - reserve_space_ptr = reserve_space->mutable_data( - ctx.GetPlace(), transformed_x.type(), reserve_space_size); - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), transformed_x.type(), workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTrainingEx( - handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), nullptr, nullptr, data_desc_, - transformed_y.template data(), bn_param_desc_, - scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()), - nullptr, workspace_ptr, workspace_size, reserve_space_ptr, - reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - const int num = 
transformed_x.numel(); - const int block = 256; - const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - if (compute_format == DataLayout::kNCHW) { - BNForwardTraining< - T, block, - DataLayout::kNCHW><<>>( - transformed_x.template data(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, this_factor, transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } else { - BNForwardTraining< - T, block, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, this_factor, transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardTraining( -// handle, mode_, const_cast(static_cast( -// CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// const_cast(static_cast( -// bias->template data>())), -// this_factor, -// static_cast( -// mean_out->template mutable_data>( -// ctx.GetPlace())), -// static_cast(variance_out->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())), -// epsilon, -// static_cast( -// saved_mean->template mutable_data>( -// ctx.GetPlace())), -// static_cast(saved_variance->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())))); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - 
platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_y.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()))); -#endif - } - } - } - - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - TransToChannelLast( - ctx, &transformed_y, y); - } -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// clean when exit. -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#else - // clean when exit. 
- PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); -#endif - } -}; - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( - const T *dy, const T *x, const BatchNormParamType *mean, - const BatchNormParamType *variance, const double epsilon, const int N, - const int C, const int HxW, BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage ds_storage; - __shared__ typename BlockReduce::TempStorage db_storage; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = static_cast>(0); - BatchNormParamType db_sum = static_cast>(0); - - BatchNormParamType inv_var_i = 1.0 / sqrt(variance[i] + epsilon); - BatchNormParamType mean_i = mean[i]; - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - ds_sum += static_cast>(dy[index]) * - (static_cast>(x[index]) - mean_i); - db_sum += static_cast>(dy[index]); - } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); - if (threadIdx.x == 0) { - dscale[i] = ds_sum * inv_var_i; - dbias[i] = db_sum; - } - __syncthreads(); - } -} - -template -static __global__ void KeBNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *variance, - const double epsilon, const int C, - const int HxW, const int num, T *dx) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? 
i / HxW % C : i % C; - BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); - dx[i] = static_cast(static_cast>(dy[i]) * - scale[c] * inv_var); - } -} - -template -static __global__ void KeBNRestoreData(const framework::DataLayout layout, T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, - double epsilon, int C, int M, - const int num, const T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C; - auto y_i = static_cast>(y[i]); - auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c]; - x[i] = static_cast(x_i); - } -} - -template -class InplaceHelper { - public: - void operator()(const framework::DataLayout layout, T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, double epsilon, int C, - int M, const int num, const T *y, int grid2, const int block, - const gpuStream_t &stream) { - PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( - "X and Y should be inplaced in inplace mode")); - KeBNRestoreData<<>>( - layout, x, scale, bias, mean, variance, epsilon, C, M, num, y); - } -}; - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( - const T *dy, const T *x, const BatchNormParamType *scale, - const BatchNormParamType *saved_mean, - const BatchNormParamType *saved_inv_variance, const int C, const int N, - const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage ds_storage; - __shared__ typename BlockReduce::TempStorage db_storage; - __shared__ typename 
BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType inv_var_val; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType dscale_val; - __shared__ BatchNormParamType dbias_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = static_cast>(0); - BatchNormParamType db_sum = static_cast>(0); - - if (saved_mean && saved_inv_variance) { - if (threadIdx.x == 0) { - inv_var_val = saved_inv_variance[i]; - mean_val = saved_mean[i]; - } - } else { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = - static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = - static_cast>(x[index]); - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - inv_var_val = - 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); - } - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType dy_i = - static_cast>(dy[index]); - ds_sum += - dy_i * (static_cast>(x[index]) - mean_val); - db_sum += dy_i; - } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); - if (threadIdx.x == 0) { - dscale_val = ds_sum * inv_var_val; - dbias_val = db_sum; - dscale[i] = dscale_val; - dbias[i] = dbias_val; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - dx[index] = scale[i] * inv_var_val * - (static_cast>(dy[index]) - - dbias_val / static_cast>(inner_size) - - (static_cast>(x[index]) - mean_val) * - inv_var_val * dscale_val / inner_size); - } - } -} - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( - const T *dy, const BatchNormParamType *scale, - const BatchNormParamType *mean, const T *x, - const BatchNormParamType *variance, const int C, const int N, - const int HxW, T *dx) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; - __shared__ BatchNormParamType dy_sum_val; - __shared__ BatchNormParamType dy_x_sub_mean_sum_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType inv_var_i = variance[i]; - BatchNormParamType mean_i = mean[i]; - BatchNormParamType dy_sum = static_cast>(0); - BatchNormParamType dy_x_sub_mean_sum = - static_cast>(0); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType dy_i = - static_cast>(dy[index]); - dy_sum += dy_i; - dy_x_sub_mean_sum += - dy_i * (static_cast>(x[index]) - mean_i); - } - - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage) - .Reduce(dy_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; - } - __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - dx[index] = - (static_cast>(dy[index]) - - dy_sum_val / static_cast>(inner_size) - - (static_cast>(x[index]) - mean_i) * - dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) * - scale[i] * inv_var_i; - } - } -} - -template -class BatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - // batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of 
batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - if (d_x) { - PADDLE_ENFORCE_EQ(d_x, d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplace in inplace mode")); - } - } else { - x = ctx.Input("X"); - is_inplace = false; - if (d_x) { - PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); - } - } - - const bool is_test = ctx.Attr("is_test"); - use_global_stats = is_test || use_global_stats; - - const auto &x_dims = x->dims(); - - PADDLE_ENFORCE_EQ( - x_dims.size() >= 2 && x_dims.size() <= 5, true, - platform::errors::InvalidArgument( - "The size of input's dimensions should be between 2 and 5." - "But received: the size of input's dimensions is [%d]," - "the dimensions of input is [%s]", - x_dims.size(), x_dims)); - int N, C, H, W, D; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - - // init output - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - } - - if (d_scale && d_bias) { - d_scale->mutable_data>(ctx.GetPlace()); - d_bias->mutable_data>(ctx.GetPlace()); - } - PADDLE_ENFORCE_EQ( - scale->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of scale's dimensions must equal to 1. But received: " - "the size of scale's dimensions is [%d], the dimensions of scale " - "is [%s].", - scale->dims().size(), scale->dims())); - PADDLE_ENFORCE_EQ( - scale->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of scale must equal to Channels[%d]. But " - "received: the first dimension of scale is [%d]", - C, scale->dims()[0])); - - auto dtype = platform::CudnnDataType::type; - const auto *reserve_space = ctx.Input("ReserveSpace"); -#ifdef PADDLE_WITH_HIP - auto compute_format = data_layout == DataLayout::kNHWC ? 
DataLayout::kNHWC - : DataLayout::kNCHW; - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// HIP do not support compute format of NHWC -// auto compute_format = DataLayout::kNCHW; -#else - const bool fast_nhwc_batch_norm = - dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && - reserve_space != nullptr; - auto compute_format = - fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC - ? DataLayout::kNHWC - : DataLayout::kNCHW; -#endif - - Tensor transformed_x(x->type()); - Tensor transformed_d_y(d_y->type()); - Tensor transformed_d_x; - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst(ctx, x, - &transformed_x); - TransToChannelFirst(ctx, x, - &transformed_x); - ResizeToChannelFirst(ctx, d_y, - &transformed_d_y); - TransToChannelFirst(ctx, d_y, - &transformed_d_y); - if (d_x) { - ResizeToChannelFirst(ctx, d_x, - &transformed_d_x); - } - } else { - transformed_x.ShareDataWith(*x); - transformed_d_y.ShareDataWith(*d_y); - if (d_x) { - transformed_d_x.ShareDataWith(*d_x); - } - } - - std::vector dims; - std::vector strides; - if (compute_format == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * C * D, 1, W * D * C, D * C, C}; - } - - auto &dev_ctx = ctx.template device_context(); - const int num = transformed_x.numel(); -#ifdef HIPCC - const int block = 256; -#else - const int block = 512; -#endif - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - int grid1 = (num + block - 1) / block; - int grid2 = std::min(C, max_blocks); - auto stream = dev_ctx.stream(); - InplaceHelper inplace_functor; - - if (!use_global_stats) { - if ((N * H * W * D) == 1) { - if (d_x) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - } - 
phi::funcs::SetConstant> - functor; - functor(dev_ctx, d_scale, static_cast>(0)); - functor(dev_ctx, d_bias, static_cast>(0)); - return; - } - -// ------------------- cudnn descriptors --------------------- -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// miopenTensorDescriptor_t data_desc_; -// miopenTensorDescriptor_t bn_param_desc_; -// miopenBatchNormMode_t mode_; - -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); -#endif - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// mode_ = miopenBNSpatial; -#elif CUDNN_VERSION_MIN(7, 0, 1) - if (FLAGS_cudnn_batchnorm_spatial_persistent) { - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#else - if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#endif // CUDNN_VERSION_MIN(7, 0, 1) - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( -// data_desc_, CudnnDataType::type, -// x_dims.size() > 3 ? 
x_dims.size() : 4, const_cast(dims.data()), -// const_cast(strides.data()))); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, -// data_desc_, mode_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); -#endif - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const auto *saved_mean_data = - saved_mean->template data>(); - const auto *saved_var_data = - saved_var->template data>(); - - if (is_inplace) { - inplace_functor(compute_format, transformed_x.data(), - scale->template data>(), - bias->template data>(), - saved_mean_data, saved_var_data, epsilon, C, H * W * D, - num, transformed_x.data(), grid2, block, stream); - } - - // This branch calls CUDNN APIs - if (d_x && d_scale && d_bias) { - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - void *workspace_ptr = nullptr; - Tensor workspace_tensor; - auto reserve_space_size = reserve_space->memory_size(); - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationBackwardExWorkspaceSize( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*yDesc=*/data_desc_, - /*dyDesc=*/data_desc_, - /*dzDesc=*/nullptr, - /*dxDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), transformed_x.type(), workspace_size); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackwardEx( - 
/*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*alphaDataDiff=*/CudnnDataType::kOne(), - /*betaDataDiff=*/CudnnDataType::kZero(), - /*alphaParamDiff=*/CudnnDataType::kOne(), - /*betaParamDiff=*/CudnnDataType::kZero(), - /*xDesc=*/data_desc_, - /*xData=*/transformed_x.template data(), - /*yDesc=*/nullptr, - /*yData=*/nullptr, - /*dyDesc=*/data_desc_, - /*dyData=*/transformed_d_y.template data(), - /*dzDesc=*/nullptr, - /*dzData=*/nullptr, - /*dxDesc=*/data_desc_, - /*dxData=*/transformed_d_x.template mutable_data( - ctx.GetPlace()), - /*dBnScaleBiasDesc=*/bn_param_desc_, - /*bnScaleData=*/scale->template data>(), - /*bnBiasData=*/nullptr, - /*dBnScaleData=*/d_scale - ->template mutable_data>( - ctx.GetPlace()), - /*dBnBiasData=*/d_bias - ->template mutable_data>( - ctx.GetPlace()), - /*epsilon=*/epsilon, - /*savedMean=*/saved_mean_data, - /*savedInvVariance=*/saved_var_data, - /*activationDesc=*/nullptr, - /*workspace=*/workspace_ptr, - /*workSpaceSizeInBytes=*/workspace_size, - /*reserveSpace=*/const_cast( - reserve_space->template data()), - /*reserveSpaceSizeInBytes=*/reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - if (compute_format == DataLayout::kNCHW) { - BNBackward< - T, block, - DataLayout::kNCHW><<>>( - transformed_d_y.template data(), - transformed_x.template data(), - scale->template data>(), saved_mean_data, - saved_var_data, C, N, H * W * D, epsilon, - transformed_d_x.template data(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace())); - } else { - BNBackward< - T, block, - DataLayout::kNHWC><<>>( - transformed_d_y.template data(), - transformed_x.template data(), - scale->template data>(), saved_mean_data, - saved_var_data, C, N, H * W * D, epsilon, - transformed_d_x.template data(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace())); - } - 
-// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationBackward( -// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), -// CudnnDataType::kZero(), CudnnDataType::kOne(), -// CudnnDataType::kZero(), data_desc_, -// transformed_x.template data(), data_desc_, -// transformed_d_y.template data(), data_desc_, -// transformed_d_x.template mutable_data(ctx.GetPlace()), -// bn_param_desc_, scale->template data>(), -// d_scale->template mutable_data>( -// ctx.GetPlace()), -// d_bias->template mutable_data>( -// ctx.GetPlace()), -// epsilon, saved_mean_data, saved_var_data)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#endif - } - - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - TransToChannelLast( - ctx, &transformed_d_x, d_x); - } - } else { - // This branch call CUDA kernels - if (compute_format == DataLayout::kNCHW) { - if (d_x) { - BNBackwardData<<< - grid2, block, 0, dev_ctx.stream()>>>( - d_y->data(), scale->data>(), - saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, - d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNCHW><<>>( - d_y->data(), x->data(), saved_mean_data, saved_var_data, - epsilon, N, C, H * W * D, - d_scale->data>(), - d_bias->data>()); - } - } else { - if (d_x) { 
- BNBackwardData<<< - grid2, block, 0, dev_ctx.stream()>>>( - d_y->data(), scale->data>(), - saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, - d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNHWC><<>>( - d_y->data(), x->data(), saved_mean_data, saved_var_data, - epsilon, N, C, H * W * D, - d_scale->data>(), - d_bias->data>()); - } - } - } - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// clean when exit. -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#else - // clean when exit. - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); -#endif - } else { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - - const auto *running_mean_data = - running_mean->template data>(); - const auto *running_var_data = - running_var->template data>(); - - if (is_inplace) { - auto px = *x; - inplace_functor(data_layout, px.mutable_data(ctx.GetPlace()), - scale->template data>(), - bias->template data>(), - running_mean_data, running_var_data, epsilon, C, - H * W * D, num, x->data(), grid2, block, stream); - } - - if (compute_format == DataLayout::kNCHW) { - if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNCHW><<>>( - d_y->data(), scale->data>(), - running_var_data, epsilon, C, H * W, num, d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNCHW><<>>( - d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, N, C, H * W * D, d_scale->data>(), - d_bias->data>()); - } - } else { - if (d_x) { - KeBNBackwardData< - T, 
framework::DataLayout::kNHWC><<>>( - d_y->data(), scale->data>(), - running_var_data, epsilon, C, H * W, num, d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNHWC><<>>( - d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, N, C, H * W * D, d_scale->data>(), - d_bias->data>()); - } - } - } - } -}; - -template -class BatchNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const double epsilon = static_cast(ctx.Attr("epsilon")); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. 
If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - NormDoubleGradFunctor( - ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon, - use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY); - } -}; - } // namespace operators } // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel, - ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel, - ops::BatchNormDoubleGradKernel); -#endif diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc index 55bb57466c7b5ec4f4ac3c51b1cf84ab5098a0e9..bc9076f4d7c368f60187e9e432dd175d1f5ad45b 100644 --- a/paddle/fluid/operators/bce_loss_op.cc +++ b/paddle/fluid/operators/bce_loss_op.cc @@ -138,8 +138,8 @@ DECLARE_INPLACE_OP_INFERER(BCELossGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, - PT_INFER_META(phi::BCELossInferMeta)); 
+DECLARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, + PD_INFER_META(phi::BCELossInferMeta)); REGISTER_OPERATOR(bce_loss, ops::BCELossOp, ops::BCELossOpMaker, ops::BCELossGradOpMaker, diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc index 4774c0a1dbc3b78607d75efb7bc82d590ca4aa2a..9f6a78ab7a55f32558accd56e69d757003bad89c 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -90,12 +90,12 @@ class BilinearTensorProductGradOpMaker namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, +DECLARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, BilinearTensorProductInferShapeFunctor, - PT_INFER_META(phi::BilinearTensorProductInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR( + PD_INFER_META(phi::BilinearTensorProductInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR( bilinear_tensor_product_grad, BilinearTensorProductGradInferShapeFunctor, - PT_INFER_META(phi::BilinearTensorProductGradInferMeta)); + PD_INFER_META(phi::BilinearTensorProductGradInferMeta)); REGISTER_OPERATOR( bilinear_tensor_product, ops::BilinearTensorProductOp, diff --git a/paddle/fluid/operators/bincount_op.cc b/paddle/fluid/operators/bincount_op.cc index b37334a14bad4fdc342d8fba13c117bfad5bd65c..062e7d510d54c0f657582d48844093d94732971e 100644 --- a/paddle/fluid/operators/bincount_op.cc +++ b/paddle/fluid/operators/bincount_op.cc @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/bincount_op.h" - #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -28,51 +31,6 @@ class BincountOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of BincountOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of BincountOp should not be null.")); - - auto input_dim = ctx->GetInputDim("X"); - auto minlength = ctx->Attrs().Get("minlength"); - - PADDLE_ENFORCE_GE(minlength, 0, - platform::errors::InvalidArgument( - "The minlength should be greater than or equal to 0." - "But received minlength is %d", - minlength)); - - PADDLE_ENFORCE_EQ(input_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(X) must be 1-D tensor." - "But the dimension of Input(X) is [%d]", - input_dim.size())); - - if (ctx->HasInput("Weights")) { - auto weights_dim = ctx->GetInputDim("Weights"); - PADDLE_ENFORCE_EQ(weights_dim.size(), 1, - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be 1-D tensor." - "But the dimension of Input(Weights) is [%d]", - weights_dim.size())); - - PADDLE_ENFORCE_EQ( - weights_dim[0], input_dim[0], - platform::errors::InvalidArgument( - "The 'shape' of Input(Weights) must be equal to the 'shape' of " - "Input(X)." 
- "But received: the 'shape' of Input(Weights) is [%s]," - "the 'shape' of Input(X) is [%s]", - weights_dim, input_dim)); - } - - ctx->SetOutputDim("Out", phi::make_ddim({-1})); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const { auto data_type = @@ -105,12 +63,10 @@ class BincountOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(bincount, BincountInferShapeFunctor, + PD_INFER_META(phi::BincountInferMeta)); REGISTER_OPERATOR( bincount, ops::BincountOp, ops::BincountOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - bincount, ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel, - ops::BincountKernel); + paddle::framework::EmptyGradOpMaker, + BincountInferShapeFunctor); diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu deleted file mode 100644 index cc576d0af92877dff44d672597596036be0defbc..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bincount_op.cu +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/bincount_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using platform::PADDLE_CUDA_NUM_THREADS; - -inline int GET_BLOCKS(const int N) { - return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; -} - -template -__global__ void KernelBincount(const InputT* input, const int total_elements, - const bool has_weights, const T* weights, - OutT* output) { - if (!has_weights) { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], 1L); - } - } else { - for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&output[input[i]], - static_cast(weights[i])); - } - } -} - -template -void BincountCUDAInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - const int input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - auto input_x = framework::EigenVector::Flatten(*input); - - framework::Tensor input_min_t, input_max_t; - auto* input_max_data = - input_max_t.mutable_data({1}, context.GetPlace()); - auto* input_min_data = - input_min_t.mutable_data({1}, context.GetPlace()); - - auto input_max_scala = framework::EigenScalar::From(input_max_t); - auto input_min_scala = framework::EigenScalar::From(input_min_t); - - auto* place = context.template device_context().eigen_device(); - input_max_scala.device(*place) = 
input_x.maximum(); - input_min_scala.device(*place) = input_x.minimum(); - - Tensor input_min_cpu, input_max_cpu; - paddle::framework::TensorCopySync(input_max_t, platform::CPUPlace(), - &input_max_cpu); - paddle::framework::TensorCopySync(input_min_t, platform::CPUPlace(), - &input_min_cpu); - - InputT input_min = input_min_cpu.data()[0]; - - PADDLE_ENFORCE_GE( - input_min, static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = - static_cast(input_max_cpu.data()[0]) + 1L; - - output_size = std::max(output_size, static_cast(minlength)); - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - const T* weights_data = has_weights ? weights->data() : nullptr; - - auto stream = - context.template device_context().stream(); - - if (!has_weights) { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - - KernelBincount<<>>( - input_data, input_numel, has_weights, weights_data, output_data); - } - } -} - -template -class BincountCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = 
context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountCUDAInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountCUDAInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - bincount, ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel, - ops::BincountCUDAKernel); diff --git a/paddle/fluid/operators/bincount_op.h b/paddle/fluid/operators/bincount_op.h deleted file mode 100644 index 84256bf78e4a1901b76b356c5e3274541dc0dd59..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/bincount_op.h +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void BincountInner(const framework::ExecutionContext& context) { - const Tensor* input = context.Input("X"); - const Tensor* weights = context.Input("Weights"); - Tensor* output = context.Output("Out"); - auto& minlength = context.Attr("minlength"); - - const InputT* input_data = input->data(); - - auto input_numel = input->numel(); - - if (input_data == nullptr) { - framework::DDim out_dim{0}; - output->Resize(out_dim); - output->mutable_data(context.GetPlace()); - return; - } - - PADDLE_ENFORCE_GE( - *std::min_element(input_data, input_data + input_numel), - static_cast(0), - platform::errors::InvalidArgument( - "The elements in input tensor must be non-negative ints")); - - int64_t output_size = static_cast(*std::max_element( - input_data, input_data + input_numel)) + - 1L; - output_size = std::max(output_size, static_cast(minlength)); - - framework::DDim out_dim{output_size}; - output->Resize(out_dim); - - bool has_weights = (weights != nullptr); - - if (has_weights) { - const T* weights_data = weights->data(); - const auto& weights_type = framework::TransToProtoVarType(weights->dtype()); - if (weights_type == framework::proto::VarType::FP32) { - float* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } else { - double* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += static_cast(weights_data[i]); - } - } 
- - } else { - int64_t* output_data = output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant()( - context.template device_context(), output, 0L); - for (int64_t i = 0; i < input_numel; i++) { - output_data[input_data[i]] += 1L; - } - } -} - -template -class BincountKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("X"); - const auto& input_type = framework::TransToProtoVarType(input->dtype()); - - if (input_type == framework::proto::VarType::INT32) { - BincountInner(context); - } else if (input_type == framework::proto::VarType::INT64) { - BincountInner(context); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc index c3917fad555cb4633d4d958abcde0244fae13cae..1063a8b7992153dbedcdc0442ac3d8038c5e171b 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.cc +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -167,9 +167,9 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, +DECLARE_INFER_SHAPE_FUNCTOR(broadcast_tensors, BroadcastTensorsInferShapeFunctor, - PT_INFER_META(phi::BroadcastTensorsInferMeta)); + PD_INFER_META(phi::BroadcastTensorsInferMeta)); REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp, ops::BroadcastTensorsOpMaker, diff --git a/paddle/fluid/operators/cholesky_op.cc b/paddle/fluid/operators/cholesky_op.cc index 09e915a6bafd4a8b72f35995b3ebbfeafa00476a..ed80ac076c0af7fc8922f095d4be4613bc5057ec 100644 --- a/paddle/fluid/operators/cholesky_op.cc +++ b/paddle/fluid/operators/cholesky_op.cc @@ -90,8 +90,8 @@ class CholeskyGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; 
-DELCARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, - PT_INFER_META(phi::CholeskyInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, + PD_INFER_META(phi::CholeskyInferMeta)); REGISTER_OPERATOR(cholesky, ops::CholeskyOp, ops::CholeskyOpMaker, ops::CholeskyGradOpMaker, ops::CholeskyGradOpMaker, diff --git a/paddle/fluid/operators/cholesky_solve_op.cc b/paddle/fluid/operators/cholesky_solve_op.cc index 6b5bae8fc73fe2b71212a93144d89144dd0268c6..5403e2440ee58f1cf7cbad107f4d3e174655ed3b 100644 --- a/paddle/fluid/operators/cholesky_solve_op.cc +++ b/paddle/fluid/operators/cholesky_solve_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/cholesky_solve_op.h" -#include "paddle/fluid/operators/solve_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -39,50 +40,6 @@ class CholeskySolveOpMaker : public framework::OpProtoAndCheckerMaker { class CholeskySolveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "CholeskySolve"); - OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "CholeskySolve"); - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "CholeskySolve"); - auto u_dims = context->GetInputDim("Y"); - auto b_dims = context->GetInputDim("X"); - int u_rank = u_dims.size(); - int b_rank = b_dims.size(); - PADDLE_ENFORCE_GE(u_rank, 2, - platform::errors::InvalidArgument( - "the rank of input Y must greater or equal to 2")); - PADDLE_ENFORCE_GE(b_rank, 2, - platform::errors::InvalidArgument( - "the rank of input X must 
greater or equal to 2")); - PADDLE_ENFORCE_EQ(u_dims[u_rank - 1], u_dims[u_rank - 2], - platform::errors::InvalidArgument( - "input Matrix Y should be square matrix," - "But Got last shape of %ld x %ld", - u_dims[u_rank - 1], u_dims[u_rank - 2])); - PADDLE_ENFORCE_EQ( - b_dims[b_rank - 2], u_dims[u_rank - 2], - platform::errors::InvalidArgument( - "the first dim of input X must equal to the dim of input Y," - "But Got %ld and %ld", - b_dims[b_rank - 2], u_dims[u_rank - 2])); - - std::vector u_dims_vec = phi::vectorize(u_dims); - std::vector b_dims_vec = phi::vectorize(b_dims); - - std::vector u_dims_vec_cut(u_dims_vec.begin(), - u_dims_vec.end() - 2); - std::vector b_dims_vec_cut(b_dims_vec.begin(), - b_dims_vec.end() - 2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(u_dims_vec_cut, b_dims_vec_cut); - - std::vector b_broadcast_dims({expand_batch_portion}); - b_broadcast_dims.insert(b_broadcast_dims.end(), - {b_dims_vec[b_rank - 2], b_dims_vec[b_rank - 1]}); - - // dim of 'Out' is the same with 'Y' after broadcast - context->SetOutputDim("Out", phi::make_ddim(b_broadcast_dims)); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -151,22 +108,15 @@ class CholeskySolveGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(cholesky_solve, CholeskySolveInferShapeFunctor, + PD_INFER_META(phi::CholeskySolveInferMeta)); + REGISTER_OPERATOR(cholesky_solve, ops::CholeskySolveOp, ops::CholeskySolveOpMaker, ops::CholeskySolveOpVarTypeInference, ops::CholeskySolveOpGradMaker, - ops::CholeskySolveOpGradMaker); + ops::CholeskySolveOpGradMaker, + CholeskySolveInferShapeFunctor); REGISTER_OPERATOR(cholesky_solve_grad, ops::CholeskySolveGradOp); - -REGISTER_OP_CPU_KERNEL( - cholesky_solve, - ops::CholeskySolveKernel, - ops::CholeskySolveKernel); - -REGISTER_OP_CPU_KERNEL( - cholesky_solve_grad, - ops::CholeskySolveGradKernel, - 
ops::CholeskySolveGradKernel); -// Complex<> is not supported because of TensorExpand, which used to boardcast -// input Tensor diff --git a/paddle/fluid/operators/cholesky_solve_op.cu b/paddle/fluid/operators/cholesky_solve_op.cu deleted file mode 100644 index 1b551a7cd0343db32a84e962212a25e1ff5a4893..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cholesky_solve_op.cu +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver - -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/cholesky_solve_op.h" -#include "paddle/fluid/platform/dynload/cusolver.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; - -template -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, - int n, int nrhs, T *Adata, int lda, T *Bdata, int ldb, - int *devInfo); - -template <> -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, - cublasFillMode_t uplo, int n, int nrhs, float *Adata, - int lda, float *Bdata, int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSpotrs( - cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); -} - -template <> -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, - cublasFillMode_t uplo, int n, int nrhs, - double *Adata, int lda, double *Bdata, int ldb, - int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDpotrs( - cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); -} - -template <> -void cusolver_potrs>( - const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, - platform::complex *Adata, int lda, platform::complex *Bdata, - int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnCpotrs( - cusolverH, uplo, n, nrhs, reinterpret_cast(Adata), lda, - reinterpret_cast(Bdata), ldb, devInfo)); -} - -template <> -void cusolver_potrs>( - const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, - platform::complex *Adata, int lda, platform::complex *Bdata, - int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnZpotrs( - cusolverH, uplo, n, nrhs, - reinterpret_cast(Adata), lda, - reinterpret_cast(Bdata), ldb, devInfo)); -} - -template -class CholeskySolveFunctor { - 
public: - void operator()(const platform::CUDADeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { - cublasFillMode_t uplo = - upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; - - /* step 1: get cusolver handle*/ - auto cusolverH = dev_ctx.cusolver_dn_handle(); - - /* step 2: solve A0*X0 = B0 */ - cusolver_potrs(cusolverH, uplo, n, nrhs, Adata, lda, Bdata, lda, - devInfo); - - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); - } -}; - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor &in, Tensor *out, - const framework::ExecutionContext &ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), - out_reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cholesky_solve, - ops::CholeskySolveKernel, - ops::CholeskySolveKernel); - -REGISTER_OP_CUDA_KERNEL( - cholesky_solve_grad, - ops::CholeskySolveGradKernel, - ops::CholeskySolveGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h deleted file mode 100644 index 
f25fbbb0c698036951c4b9ae8e9ad2778786a1a2..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cholesky_solve_op.h +++ /dev/null @@ -1,248 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/solve_op.h" -#include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/operators/triangular_solve_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" -#include "paddle/phi/kernels/math_kernel.h" - -namespace paddle { -namespace operators { // namespace operators - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::DeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo); -}; - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::CPUDeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { - char uplo = upper ? 
'U' : 'L'; - phi::funcs::lapackCholeskySolve(uplo, n, nrhs, Adata, lda, Bdata, lda, - devInfo); - } -}; - -template -void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, - const framework::Tensor &uin, - const framework::Tensor &bin, framework::Tensor *out, - bool upper) { - const auto &dev_ctx = ctx.template device_context(); - // framework::Tensor broadcast - std::vector u_bst_dims_vec; - std::vector b_bst_dims_vec; - std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(uin, bin); - framework::Tensor u_bst(uin.type()); - TensorExpand(dev_ctx, uin, &u_bst, u_bst_dims_vec); - - framework::Tensor b_bst(bin.type()); - TensorExpand(dev_ctx, bin, &b_bst, b_bst_dims_vec); - - math::DeviceIndependenceTensorOperations helper(ctx); - - // calculate u's conjugate for complex - framework::Tensor u_conj(u_bst.type()); - platform::ForRange u_for_range(dev_ctx, u_bst.numel()); - phi::funcs::ConjFunctor u_functor( - u_bst.data(), u_bst.numel(), - u_conj.mutable_data(u_bst.dims(), dev_ctx.GetPlace())); - u_for_range(u_functor); - u_conj = helper.Transpose(u_conj); - - // calculate b's conjugate for complex - framework::Tensor b_conj(b_bst.type()); - platform::ForRange b_for_range(dev_ctx, b_bst.numel()); - phi::funcs::ConjFunctor b_functor( - b_bst.data(), b_bst.numel(), - b_conj.mutable_data(b_bst.dims(), dev_ctx.GetPlace())); - b_for_range(b_functor); - b_conj = helper.Transpose(b_conj); - - auto ut_data = u_conj.mutable_data(dev_ctx.GetPlace()); - auto uindims = u_bst.dims(); - auto bindims = b_bst.dims(); - int uinrank = uindims.size(); - int binrank = bindims.size(); - - int n = uindims[uinrank - 2]; - int nrhs = bindims[binrank - 1]; - int ldab = std::max(1, n); - - // framework::Tensor out_copy(b_conj.type()); - // out_copy.Resize(b_conj.dims()); - framework::TensorCopy(b_conj, dev_ctx.GetPlace(), out); - T *out_data = out->mutable_data(dev_ctx.GetPlace()); - - auto info_dims = phi::slice_ddim(bindims, 0, binrank - 2); - auto batchsize = 
product(info_dims); - - framework::Tensor tmp; - std::vector tmpdim(1, batchsize); - tmp.Resize(phi::make_ddim(tmpdim)); - int *info = tmp.mutable_data(dev_ctx.GetPlace()); - - CholeskySolveFunctor functor; - for (int b = 0; b < batchsize; b++) { - auto uin_data_item = &ut_data[b * n * n]; - auto out_data_item = &out_data[b * n * nrhs]; - auto info_item = &info[b]; - functor(dev_ctx, upper, n, nrhs, uin_data_item, ldab, out_data_item, - info_item); - } - - // calculate out's conjugate for complex - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out->mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - *out = helper.Transpose(*out); -} - -template -class CholeskySolveKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto *uin = ctx.Input("Y"); - auto *bin = ctx.Input("X"); - auto *out = ctx.Output("Out"); - auto upper = ctx.Attr("upper"); - cholesky_solve_fn(ctx, *uin, *bin, out, upper); - } -}; - -template -class CholeskySolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *bin = ctx.Input("X"); - auto *uin = ctx.Input("Y"); - auto *out = ctx.Input("Out"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *db = ctx.Output(framework::GradVarName("X")); - auto *du = ctx.Output(framework::GradVarName("Y")); - auto upper = ctx.Attr("upper"); - - const auto &dev_ctx = ctx.template device_context(); - math::DeviceIndependenceTensorOperations helper(ctx); - - std::vector u_bst_dims_vec; - std::vector b_bst_dims_vec; - std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(*uin, *bin); - framework::Tensor u_bst(uin->type()); - TensorExpand(dev_ctx, *uin, &u_bst, u_bst_dims_vec); - - framework::Tensor db_bst(bin->type()); - TensorExpand(dev_ctx, *bin, &db_bst, b_bst_dims_vec); - - if 
(dout) { - db->mutable_data(dev_ctx.GetPlace()); - cholesky_solve_fn(ctx, u_bst, *dout, &db_bst, upper); - - if (db_bst.dims() == db->dims()) { - framework::TensorCopy(db_bst, dev_ctx.GetPlace(), dev_ctx, db); - } else { - MatrixReduceSumFunctor functor; - functor(db_bst, db, ctx); - db->Resize(bin->dims()); - } - - auto blas = phi::funcs::GetBlas(ctx); - - // calculate out's conjugate for complex - framework::Tensor out_conj(out->type()); - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - out_conj = helper.Transpose(out_conj); - - framework::Tensor commonterm(out->type()); - auto outdims = out_conj.dims(); - auto dbdims = db_bst.dims(); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(outdims, 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(dbdims, 0, false); - auto cmtdim = outdims; - cmtdim[cmtdim.size() - 2] = dbdims[dbdims.size() - 2]; - commonterm.Resize(cmtdim); - commonterm.mutable_data(dev_ctx.GetPlace()); - blas.MatMul(db_bst, mat_dim_b, out_conj, mat_dim_a, static_cast(1), - &commonterm, static_cast(0)); - - // calculate commonterm's conjugate for complex - framework::Tensor commonterm_conj(commonterm.type()); - platform::ForRange commonterm_for_range( - dev_ctx, commonterm.numel()); - phi::funcs::ConjFunctor commonterm_functor( - commonterm.data(), commonterm.numel(), - commonterm_conj.mutable_data(commonterm.dims(), - dev_ctx.GetPlace())); - commonterm_for_range(commonterm_functor); - commonterm_conj = helper.Transpose(commonterm_conj); - - phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), - commonterm, commonterm_conj, -1, &commonterm); - - auto mat_dim_u = - phi::funcs::CreateMatrixDescriptor(u_bst.dims(), 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false); - - Tensor du_bst(uin->type()); - // get upper or lower 
triangular - du_bst.Resize(u_bst.dims()); - du_bst.mutable_data(dev_ctx.GetPlace()); - if (upper) { - blas.MatMul(u_bst, mat_dim_u, commonterm, mat_dim_c, static_cast(-1), - &du_bst, static_cast(0)); - } else { - blas.MatMul(commonterm, mat_dim_c, u_bst, mat_dim_u, static_cast(-1), - &du_bst, static_cast(0)); - } - - const auto &udims = u_bst.dims(); - const auto H = udims[udims.size() - 2]; - const auto W = udims[udims.size() - 1]; - platform::ForRange x_for_range(dev_ctx, u_bst.numel()); - TrilTriuCompute tril_triu_computer(du_bst.data(), 0, !upper, H, W, - u_bst.data()); - x_for_range(tril_triu_computer); - - du->mutable_data(dev_ctx.GetPlace()); - if (u_bst.dims() == du->dims()) { - framework::TensorCopy(u_bst, dev_ctx.GetPlace(), dev_ctx, du); - } else { - MatrixReduceSumFunctor functor; - functor(u_bst, du, ctx); - du->Resize(uin->dims()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index f1247ebdf23c8e00cdbfd662a160912a769d7558..2092f65212a6a71534e1ea9a6977abc94bf97b6a 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -1,9 +1,9 @@ include(operators) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) -cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy parallel_executor cinn) +cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy device_context parallel_executor cinn) -SET(CINN_OP_DEPS string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) +SET(CINN_OP_DEPS parallel_executor string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) @@ -11,7 +11,7 @@ if (WITH_TESTING) set_tests_properties(cinn_launch_context_test PROPERTIES LABELS 
"RUN_TYPE=CINN") SET(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda") - cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op elementwise_add_op) + cc_test(cinn_launch_op_test SRCS cinn_launch_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op gflags) set_tests_properties(cinn_launch_op_test PROPERTIES LABELS "RUN_TYPE=CINN" ENVIRONMENT "${CINN_RUN_ENVIRONMENT}") cc_test(cinn_instruction_run_op_test SRCS cinn_instruction_run_op_test.cc DEPS cinn_compiler cinn_launch_op cinn_instruction_run_op elementwise_add_op) diff --git a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc index 7c4bdc09a569e455b20febef278003ada923dd79..0edbee534c0b5d680717250e7702f272eacd0272 100644 --- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc @@ -22,11 +22,17 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/init.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP(cinn_launch); USE_OP(cinn_instruction_run); USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +#ifdef PADDLE_WITH_CUDA +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +#endif + namespace paddle::operators { using framework::paddle2cinn::CinnCompiler; @@ -50,7 +56,7 @@ TEST(CinnInstructionOpTest, TestWithElementwiseAdd) { auto cinn_instruction_run_op = paddle::framework::OpRegistry::CreateOp( "cinn_instruction_run", {{"X", {"x", "y"}}}, {{"Out", {test_op_out_name}}}, - {{"cached_index", 0}, {"instruction_index", 1}}); + {{"cached_index", 0}, {"instruction_index", 0}}); auto elementwise_add_op = paddle::framework::OpRegistry::CreateOp( "elementwise_add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {add_op_out_name}}}, {{}}); diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 0a21d937aa1a70120e6112cdb291aa41eb222bb3..b76dd60409221eef9204f26319dabb20db4a36ac 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -31,6 +31,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/core/ddim.h" @@ -90,9 +91,30 @@ CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, // Convert the CINN runtime program to a Paddle graph runtime_graph_ = std::make_unique( BuildCompiledProgram(graph, compiled_obj)); - runtime_graph_->SetNotOwned( - kMemOptVarInfoFromMainGraph, - &graph.Get(kMemOptVarInfoFromMainGraph)); + auto& outer_varinfo = graph.Get(kMemOptVarInfoFromMainGraph); + 
runtime_graph_->SetNotOwned(kMemOptVarInfoFromMainGraph, + &outer_varinfo); + // collect skip_eager_vars + skip_eager_vars_.reserve(input_var_names.size() + output_var_names.size()); + auto add_skip_var_fn = [&outer_varinfo, this](const std::string& var_name) { + // if a var exists at outer_varinfo map, + // that means it can be erased after graph execution + if (!outer_varinfo.count(var_name)) { + skip_eager_vars_.emplace_back(var_name); + } + }; + std::for_each(input_var_names.begin(), input_var_names.end(), + add_skip_var_fn); + std::for_each(output_var_names.begin(), output_var_names.end(), + add_skip_var_fn); + VLOG(4) << string::Sprintf( + "Distribution of variables in the graph compiled:" + "input[%lu],internal[%lu],output[%lu]," + "outer_eager_deletion[%lu],skip_eager_deletion[%lu]," + "initialized_beforehand[%lu]", + input_var_names.size(), internal_var_names_.size(), + output_var_names.size(), outer_varinfo.size(), skip_eager_vars_.size(), + initialized_beforehand_vars_.size()); } void CinnLaunchContext::BuildVarNameMap( @@ -288,6 +310,7 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( // are set by values of the corresponding compiled tensors, // including the in/out variables where the equiality between their tensors // and the CINN compiled ones is verified in corresponding cinn_launch_op. 
+ std::unordered_set has_refer_vars; for (auto&& arg : cinn_argument_names_) { const std::string& var_name = cinn2paddle_varmap_.at(arg); framework::VarDesc* var_desc = block->Var(var_name); @@ -298,6 +321,7 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( auto* ori_desc = res->second; var_desc->SetPersistable(ori_desc->Persistable()); var_desc->SetIsParameter(ori_desc->IsParameter()); + has_refer_vars.insert(var_name); } auto cinn_tensor = GetCinnTensorOfVar(var_name); @@ -331,6 +355,12 @@ framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( auto* ins = instructions.at(ins_idx).get(); auto in_args = trans_and_pack_args_fn(ins->GetInArgs()); auto out_args = trans_and_pack_args_fn(ins->GetOutArgs()); + for (auto&& var_name : in_args) { + if (!has_refer_vars.count(var_name)) { + initialized_beforehand_vars_.emplace_back(var_name); + } + } + has_refer_vars.insert(out_args.begin(), out_args.end()); auto* op_desc = block->AppendOp(); op_desc->SetType("cinn_instruction_run"); @@ -348,16 +378,26 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, framework::Scope* scope) { if (!parallel_executor_) { framework::details::ExecutionStrategy exec_strategy; + exec_strategy.num_threads_ = 1; + exec_strategy.use_device_ = platform::Place2DeviceType(place); framework::details::BuildStrategy build_strategy; parallel_executor_ = std::make_unique( place, scope, exec_strategy, build_strategy, runtime_graph_.get()); } // update the scope bound to an OpHandle and rebuild temporary variables + VLOG(4) << "Reset scope and initialize temporary variables"; std::unordered_map scope_map = { {parallel_executor_->GetLocalScopes().front(), scope}}; parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); parallel_executor_->PrepareVariables(scope); + for (auto&& var_name : initialized_beforehand_vars_) { + auto* var = scope->GetVar(var_name); + auto* buffer = GetCinnBufferOfVar(var_name); + auto dim = framework::DDim(buffer->dims, 
buffer->dimensions); + var->GetMutable()->Resize(dim); + var->GetMutable()->mutable_data(place); + } return parallel_executor_.get(); } diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index a4d613ea618a886d99344a34ad80aa02e88c10e7..ed5e4383d83d23322860e3f554160013fd5532c9 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -86,6 +86,11 @@ class CinnLaunchContext { void CheckTensorEquivalent(const std::string& var_name, const framework::LoDTensor& paddle_tensor); + // Return the name list of variables skipped eager deletion + const std::vector& GetSkipEagerVars() const { + return skip_eager_vars_; + } + // Return internal variable names list const std::unordered_set& GetInternalVarNames() const { return internal_var_names_; @@ -143,6 +148,9 @@ class CinnLaunchContext { std::unordered_set internal_var_names_; // the names of the cinn arguments used in compiled executable program std::unordered_set cinn_argument_names_; + // TODO(CtfGo): remove this list after fixing batch_norm bug + // due to duplicate association in the same variable. 
+ std::vector initialized_beforehand_vars_; // the variable scope compiled from cinn const std::shared_ptr cinn_scope_; @@ -150,6 +158,8 @@ class CinnLaunchContext { std::unique_ptr runtime_graph_; // a ParallelExecutor to execute the runtime graph std::unique_ptr parallel_executor_; + // the name list of skip_eager_vars in runtime + std::vector skip_eager_vars_; // because a cinn_pod_value_t does not own a cinn_buffer_t object, // an extra stroage is necessary to keep those objects and they can diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index cf3b98c6679b80acad8da69c91addadb9f66ce44..5263aae03ed3f1ab6afa4eb9e6bd38f61858b397 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -103,8 +103,8 @@ class CinnLaunchOpKernel : public framework::OpKernel { details::DebugCinnCompiledResult(cinn_compiled_object); auto* launch_context = cinn_compiled_object.launch_context.get(); - // Step 3. Prepare arguments needed for the compiled executable program. - launch_context->UpdateCapturedEnv(scope, place); + // Step 3. check the computational consistency of the subgraph + // before and after the compilation // 3.1 Input variables: tensors of input variables have // been initialized before graph compiled, just check the // equiality between tensors of paddle and cinn. @@ -120,20 +120,15 @@ class CinnLaunchOpKernel : public framework::OpKernel { *inputs_name2tensor.at(var_name)); } - // 3.2 Output variables: the output variables will be initialized - // and allocated buffer in callbacks which are defined in the - // external_malloc/free interface of cinn_buffer_t - // in their corresponding arguments. - // 3.3 Internal variables: A temporary scope is created in - // UpdateCapturedEnv to keep the internal variables and - // they are also initialized through callbacks - // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. 
details::SetCinnRuntimeFlags(); - // Step 5. Launch CINN to execute the compiled executable program - VLOG(4) << "Run Cinn compiled executable program with stream: " << stream; - details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); + // Step 5. use PE to execute the compiled CINN instructions + // in nodes of the runtime graph + VLOG(4) << "Execute the runtime graph by PE"; + framework::Scope& exec_scope = scope.NewScope(); + auto* pe = launch_context->InitializePE(place, &exec_scope); + pe->RunWithoutFetch(launch_context->GetSkipEagerVars()); VLOG(4) << "CinnLaunchOp launch execution done."; } }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index f5b6161ff3462cc1f12df7f59b4709bf19032df2..585f1caabed051134fd5ce7624c17b741b487ef0 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" @@ -25,9 +26,17 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/init.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP(cinn_launch); +USE_OP(cinn_instruction_run); USE_OP_ITSELF(elementwise_add); +DECLARE_double(eager_delete_tensor_gb); + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +#ifdef PADDLE_WITH_CUDA +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +#endif namespace paddle::operators { @@ -61,6 +70,7 @@ TEST(CinnLaunchOpTest, TestWithElementwiseAdd) { CompareOpResult(scope.GetVar(test_op_out_name), scope.GetVar(add_op_out_name)); }; + FLAGS_eager_delete_tensor_gb = -1; // CPU run_and_check_fn(platform::CPUPlace()); diff --git a/paddle/fluid/operators/collective/c_allgather_op_mlu.cc b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f29bc57c9a5f4dbbfd53220ce187b386b3025e55 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allgather_op.h" + +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#endif +#include "paddle/fluid/framework/convert_utils.h" + +namespace paddle { +namespace operators { + +template +class CAllGatherOpMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_CNCL) + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + cnclDataType_t dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype())); + + int nranks = ctx.Attr("nranks"); + int rid = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = platform::CNCLCommContext::Instance().Get(rid, place); + PADDLE_ENFORCE_EQ( + nranks, comm->nranks(), + platform::errors::InvalidArgument("nranks: %s should equal to %s", + nranks, comm->nranks())); + + framework::DDim out_dims = x->dims(); + out_dims[0] *= nranks; + out->mutable_data(out_dims, place); + + uint32_t send_numel = x->numel(); + void* send_buff = reinterpret_cast(const_cast(x->data())); + void* recv_buff = reinterpret_cast(out->data()); + + mluStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(send_buff, recv_buff, send_numel, + dtype, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with MLU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(c_allgather, ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + 
ops::CAllGatherOpMLUKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc index c0968581acda9950aaa8ee2b8f3af15e1db59a67..7206dd01bcaa3e588cc275c2fdf25e70aacc1663 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index 31b00a93f1396564907a7872e919ba6c96f666d8..0946ad8aca65e28835ea1d139fb94c309ce840a1 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 7e5120cd2b392b1eb0698727ccebac485193f6d9..2c4e85400ca4adadce5db1fd318ce2273caa201f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -413,7 +413,7 @@ class CAllReduceOpMLUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); cnclDataType_t dtype = - platform::ToCNCLDataType(framework::TransToProtoVarType(in->type())); + platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype())); int64_t numel = in->numel(); const void* sendbuff = in->data(); out->Resize(in->dims()); diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index 9c11704704ed420b14a6ccd9873e0bfbe143b4fe..61e5f27903477972ef10465ccfd6f8de8ce8fba6 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc index d315f211709e4f76c2d5c685721961a91c2102fe..d1e269fb5a4fe9505acf7043bc7a2cea36823ffa 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc @@ -31,7 +31,7 @@ class CBroadcastOPMLUKernel : public framework::OpKernel { auto out = ctx.Output("Out"); int numel = x->numel(); cnclDataType_t dtype = - platform::ToCNCLDataType(framework::TransToProtoVarType(x->type())); + platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype())); int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc index 5787090e6a52f2f37bd504a904108cd1d24caf5f..cf4d6a28744b368212fe8bcb0924001aa53b5a4e 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc index c79b2f92b69a1e6cc5c6f1cf17fa402c671a1997..c4e410d04da5fb5e9b6bfe4d7d5c263084889f54 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc index d9a7a4abb08fc883b9b9210fcdefd56af127263a..8b498787c69db0f978acaa68ba63883270e11eb4 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index b8abf458c1c6d395fef08238abaa114ff5dc6e9e..133085ad3f3b0ffd00dbf4d026687b0311116951 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc index bb78971734bf05e94f7b0ebc1f1540b254f98067..36c6f4fadd0fcc9b06c61d5c45ce6829f2d3d977 100644 --- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc +++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc @@ -27,7 +27,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc index 8f7b8c4a9040be3a2b4540c693c128e92c06a180..6e02d362156970cdee7257c7d00b70cef0519757 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index c40b2c3e76a02ce6e5e754b2dc4280d6917145e7..57e3dd53cc7748fa0fb66e7e934a1c9cd764a15f 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -25,7 +25,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 1da7798ea2696516759ac49b8ce459459e74066b..059fafa3e7f4d4ff0dac7541038d62e03865529f 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -205,8 +205,8 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, - PT_INFER_META(phi::ConcatInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, + PD_INFER_META(phi::ConcatInferMeta)); REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, ops::ConcatGradOpMaker, diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc index 95135ba3b1a3db156cd80629296481470b11f937..cbec1182f20b886fb4a77847abf7213aec9990a5 100644 --- a/paddle/fluid/operators/conj_op.cc +++ b/paddle/fluid/operators/conj_op.cc @@ -66,8 +66,8 @@ class ConjGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(conj, ConjInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(conj, ops::ConjOp, ops::ConjOpMaker, ops::ConjGradMaker, ops::ConjGradMaker, diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index a974f2ec335487e0fbc12a578c0d80d6856e418e..0c18522fa32eae5f357da062fbd25fa92878cc08 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -19,6 +19,6 @@ else() 
target_link_libraries(conditional_block_infer_op conditional_block_op) endif() -file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(less_than);\nUSE_OP_ITSELF(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n") -file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(bitwise_and);\nUSE_OP_ITSELF(bitwise_or);\nUSE_OP_ITSELF(bitwise_xor);\nUSE_OP_ITSELF(bitwise_not);\n") diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cc b/paddle/fluid/operators/controlflow/bitwise_op.cc index 55cab03ea9e3f18f36043848914ac11fac1027c9..4dcbbc8568ff18a1313171f8f66f276d77f019a1 100644 --- a/paddle/fluid/operators/controlflow/bitwise_op.cc +++ b/paddle/fluid/operators/controlflow/bitwise_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/bitwise_op.h" #include #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" namespace paddle { namespace operators { @@ -75,11 +75,19 @@ It operates ``%s`` on Tensor ``X`` . 
} }; -class BitwiseOp : public framework::OperatorWithKernel { +template +class UnaryBitwiseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; protected: + void InferShape(framework::InferShapeContext *context) const override { + OpComment comment; + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); @@ -90,23 +98,9 @@ class BitwiseOp : public framework::OperatorWithKernel { }; template -class UnaryBitwiseOp : public BitwiseOp { - public: - using BitwiseOp::BitwiseOp; - - protected: - void InferShape(framework::InferShapeContext *context) const override { - OpComment comment; - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); - context->SetOutputDim("Out", context->GetInputDim("X")); - context->ShareLoD("X", "Out"); - } -}; - -template -class BinaryBitwiseOp : public BitwiseOp { +class BinaryBitwiseOp : public framework::OperatorWithKernel { public: - using BitwiseOp::BitwiseOp; + using framework::OperatorWithKernel::OperatorWithKernel; protected: void InferShape(framework::InferShapeContext *context) const override { @@ -130,6 +124,14 @@ class BinaryBitwiseOp : public BitwiseOp { } context->ShareLoD("X", "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // BitwiseOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } }; } // namespace operators @@ -167,8 +169,3 @@ REGISTER_BINARY_BITWISE_OP(bitwise_and, "Out = X \\& Y"); REGISTER_BINARY_BITWISE_OP(bitwise_or, "Out = X | Y"); 
REGISTER_BINARY_BITWISE_OP(bitwise_xor, "Out = X ^\\wedge Y"); REGISTER_UNARY_BITWISE_OP(bitwise_not, "Out = \\sim X"); - -REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CPU, ops::BitwiseAndFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CPU, ops::BitwiseOrFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CPU, ops::BitwiseXorFunctor); -REGISTER_UNARY_BITWISE_KERNEL(bitwise_not, CPU, ops::BitwiseNotFunctor); diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cu b/paddle/fluid/operators/controlflow/bitwise_op.cu deleted file mode 100644 index 5d98da2c027fb6ee681bbea3980f1dbf631d6431..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/bitwise_op.cu +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/controlflow/bitwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace paddle { -namespace operators { - -template -class BinaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using T = typename Functor::ELEM_TYPE; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto functor = Functor(); - std::vector ins = {x, y}; - std::vector outs = {out}; - const auto& cuda_ctx = - ctx.template device_context(); - paddle::operators::LaunchElementwiseCudaKernel(cuda_ctx, ins, &outs, -1, - functor); - } -}; - -template -class UnaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using T = typename Functor::ELEM_TYPE; - - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto functor = Functor(); - std::vector ins = {x}; - std::vector outs = {out}; - const auto& cuda_ctx = - ctx.template device_context(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(cuda_ctx, ins, - &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = ::paddle::operators; -namespace plat = ::paddle::platform; - -REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CUDA, ops::BitwiseAndFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CUDA, ops::BitwiseOrFunctor); -REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CUDA, ops::BitwiseXorFunctor); -REGISTER_UNARY_BITWISE_KERNEL(bitwise_not, CUDA, ops::BitwiseNotFunctor); diff --git a/paddle/fluid/operators/controlflow/bitwise_op.h b/paddle/fluid/operators/controlflow/bitwise_op.h deleted file mode 100644 index 9e652f92007479684fcf8ec5e539312d8d729107..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/bitwise_op.h 
+++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ - template \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = T; \ - HOSTDEVICE T operator()(const T a, const T b) const { return a expr b; } \ - }; \ - \ - template <> \ - struct Bitwise##func##Functor { \ - using ELEM_TYPE = bool; \ - HOSTDEVICE bool operator()(const bool a, const bool b) const { \ - return a bool_expr b; \ - } \ - }; - -BITWISE_BINARY_FUNCTOR(And, &, &&) -BITWISE_BINARY_FUNCTOR(Or, |, ||) -BITWISE_BINARY_FUNCTOR(Xor, ^, !=) -#undef BITWISE_BINARY_FUNCTOR - -template -struct BitwiseNotFunctor { - using ELEM_TYPE = T; - HOSTDEVICE T operator()(const T a) const { return ~a; } -}; - -template <> -struct BitwiseNotFunctor { - using ELEM_TYPE = bool; - HOSTDEVICE bool operator()(const bool a) const { return !a; } -}; - -template -class BinaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - auto func = Functor(); - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = 
context.Output("Out"); - ElementwiseComputeEx(context, x, y, -1, func, - out); - } -}; - -template -class UnaryBitwiseOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - auto func = Functor(); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - platform::Transform trans; - trans(context.template device_context(), x->data(), - x->data() + x->numel(), out->mutable_data(context.GetPlace()), - func); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = ::paddle::operators; -namespace plat = ::paddle::platform; - -#define REGISTER_BINARY_BITWISE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>, \ - ops::BinaryBitwiseOpKernel>); - -#define REGISTER_UNARY_BITWISE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>, \ - ops::UnaryBitwiseOpKernel>); diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cc b/paddle/fluid/operators/controlflow/compare_all_op.cc index ede349f737d899e5f04cb5e35d1dbc0c0abc2403..dd407f4f6f3c51ef99cb09f08ef7fdca5b1e10bc 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cc +++ b/paddle/fluid/operators/controlflow/compare_all_op.cc @@ -12,49 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/controlflow/compare_all_op.h" -#include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { -template -class CompareReduceOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - Tensor tmp; - bool* z_data = z->mutable_data(context.GetPlace()); - - if (x->dims() != y->dims()) { - z_data[0] = false; - } else { - tmp.mutable_data(x->dims(), context.GetPlace()); - if (x->numel() == 1 && y->numel() == 1) { - bool* z_data = tmp.mutable_data(context.GetPlace()); - z_data[0] = Functor()(x->data()[0], y->data()[0]); - } else { - ElementwiseComputeEx( - context, x, y, 0, Functor(), &tmp); - } - auto ipt = framework::EigenVector::Flatten(tmp); - auto out = framework::EigenScalar::From(*z); - auto& place = - *context.template device_context() - .eigen_device(); - auto reduce_dim = Eigen::array({{0}}); - out.device(place) = ipt.all(reduce_dim); - } - } -}; - template class CompareReduceOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: @@ -81,26 +46,6 @@ template class CompareReduceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* context) const override { - OpComment comment; - PADDLE_ENFORCE_EQ(context->HasInput("X"), true, - platform::errors::InvalidArgument( - "%s operator must have input X", comment.type)); - PADDLE_ENFORCE_EQ(context->HasInput("Y"), true, - platform::errors::InvalidArgument( - "%s operator must have input Y", comment.type)); - auto dim_x = 
context->GetInputDim("X"); - auto dim_y = context->GetInputDim("Y"); - PADDLE_ENFORCE_GE( - dim_x.size(), dim_y.size(), - platform::errors::InvalidArgument( - "The size of dim_y should not be greater than dim_x's.")); - - context->SetOutputDim("Out", {1}); - context->ShareLoD("X", "Out"); - } }; } // namespace operators @@ -113,25 +58,13 @@ class CompareReduceOp : public framework::OperatorWithKernel { }; \ char _##op_type##Comment::type[]{#op_type}; \ char _##op_type##Comment::equation[]{_equation}; \ + DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PD_INFER_META(phi::CompareAllInferMeta)); \ REGISTER_OPERATOR( \ op_type, ::paddle::operators::CompareReduceOp<_##op_type##Comment>, \ ::paddle::operators::CompareReduceOpProtoMaker<_##op_type##Comment>, \ ::paddle::framework::EmptyGradOpMaker, \ - ::paddle::framework::EmptyGradOpMaker); + ::paddle::framework::EmptyGradOpMaker, \ + op_type##_InferShapeFunctor); -#define REGISTER_COMPARE_REDUCE_CPU_KERNEL(op_type, functor) \ - REGISTER_OP_CPU_KERNEL( \ - op_type, ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>); REGISTER_COMPARE_REDUCE_OP(equal_all, "X == Y"); - -REGISTER_COMPARE_REDUCE_CPU_KERNEL(equal_all, - paddle::operators::EqualReduceFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu deleted file mode 100644 index d96dcebe51f97f1a3a954966aeb3663ff1f7a819..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/compare_all_op.cu +++ /dev/null @@ 
-1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "paddle/fluid/operators/controlflow/compare_all_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" - -namespace paddle { -namespace operators { - -template -struct BitwiseAdd { - // Bitwise add operator, returns a + b - inline T initial() { return static_cast(true); } - - __host__ __device__ __forceinline__ T operator()(const T& a, - const T& b) const { - return a & b; - } -}; - -template -class CompareReduceOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - bool* z_data = z->mutable_data(context.GetPlace()); - Tensor tmp; - - if (x->dims() != y->dims()) { - thrust::device_ptr z_dev_ptr(z_data); - thrust::fill(z_dev_ptr, z_dev_ptr + 1, false); - return; - } else { - tmp.mutable_data(x->dims(), context.GetPlace()); - const auto& cuda_ctx = - context.template device_context(); - std::vector ins = {x, y}; - std::vector outs = {&tmp}; - paddle::operators::LaunchSameDimsElementwiseCudaKernel( - cuda_ctx, ins, &outs, Functor()); - - // Reduce by 'bitwise and' operator - std::vector reduce_dims; - 
reduce_dims.resize(tmp.dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - auto stream = context.cuda_device_context().stream(); - TensorReduceImpl>( - context.cuda_device_context(), tmp, z, kps::IdentityFunctor(), - reduce_dims, stream); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#define REGISTER_COMPARE_REDUCE_CUDA_KERNEL(op_type, functor) \ - REGISTER_OP_CUDA_KERNEL( \ - op_type, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>, \ - ops::CompareReduceOpKernel>); - -REGISTER_COMPARE_REDUCE_CUDA_KERNEL(equal_all, EqualReduceFunctor) -#undef REGISTER_COMPARE_REDUCE_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/compare_all_op.h b/paddle/fluid/operators/controlflow/compare_all_op.h deleted file mode 100644 index 78a7b76e3fd9d03f2381dfb13f90c191d1dca4f8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/compare_all_op.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -template -struct EqualReduceFunctor { - using ELEM_TYPE = T; - HOSTDEVICE bool operator()(const T a, const T b) const { - if (std::is_floating_point::value) { - // This branch will be optimized while compiling if T is integer. It is - // safe to cast a and b to double. - return fabs(static_cast(a - b)) < 1e-8; - } else { - return (a == b); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 657e74398bb24bb4c2a5514bbb1656126591ee4e..72d81d8c3fdf2827da9b8362cee80ecbb16e4484 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/controlflow/compare_op.h" -#include -#include -#include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -60,31 +58,6 @@ class CompareOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext* context) const override { - OpComment comment; - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); - OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", comment.type); - auto dim_x = context->GetInputDim("X"); - auto dim_y = context->GetInputDim("Y"); - - if (context->GetInputDim("X") == context->GetInputDim("Y")) { - context->ShareDim("X", /*->*/ "Out"); - context->ShareLoD("X", /*->*/ "Out"); - } else { - int max_dim = std::max(dim_x.size(), dim_y.size()); - int axis = std::abs(dim_x.size() - dim_y.size()); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(dim_x, dim_y, x_dims_array.data(), - y_dims_array.data(), out_dims_array.data(), - max_dim, axis); - context->SetOutputDim("Out", phi::make_ddim(out_dims_array)); - // to do - context->ShareLoD("X", /*->*/ "Out"); - } - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); @@ -116,37 +89,31 @@ class CompareOp : public framework::OperatorWithKernel { "In order to force fill output variable to gpu memory.", \ false)); -#define REGISTER_COMPARE_OP(op_type, _equation) \ - struct _##op_type##Comment { \ - static 
char type[]; \ - static char equation[]; \ - }; \ - char _##op_type##Comment::type[]{#op_type}; \ - char _##op_type##Comment::equation[]{_equation}; \ - REGISTER_OPERATOR( \ - op_type, ::paddle::operators::CompareOp<_##op_type##Comment>, \ - ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ - ::paddle::framework::EmptyGradOpMaker, \ - ::paddle::framework::EmptyGradOpMaker); \ +#define REGISTER_COMPARE_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + DECLARE_INFER_SHAPE_FUNCTOR(op_type, op_type##_InferShapeFunctor, \ + PD_INFER_META(phi::CompareInferMeta)); \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::CompareOp<_##op_type##Comment>, \ + ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker, \ + ::paddle::framework::EmptyGradOpMaker, \ + op_type##_InferShapeFunctor); \ REGISTER_COMPARE_OP_VERSION(op_type); REGISTER_COMPARE_OP(less_than, "Out = X < Y"); -REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, - paddle::operators::GreaterThanFunctor); + REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); -REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, - paddle::operators::GreaterEqualFunctor); + REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); -REGISTER_COMPARE_KERNEL(greater_than, CPU, - paddle::operators::GreaterThanFunctor, - paddle::operators::LessThanFunctor); + REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); -REGISTER_COMPARE_KERNEL(greater_equal, CPU, - paddle::operators::GreaterEqualFunctor, - paddle::operators::LessEqualFunctor); + REGISTER_COMPARE_OP(equal, "Out = X == Y"); -REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, - paddle::operators::EqualFunctor); + REGISTER_COMPARE_OP(not_equal, "Out = X != Y"); -REGISTER_COMPARE_KERNEL(not_equal, 
CPU, paddle::operators::NotEqualFunctor, - paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu deleted file mode 100644 index 4b9452d0f60e0396e4bc50bb5ea56e2f3131098e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/controlflow/compare_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -template -class CompareOpKernel - : public framework::OpKernel { - public: - using InT = typename Functor::ELEM_TYPE; - using OutT = bool; - void Compute(const framework::ExecutionContext& ctx) const override { - auto functor = Functor(); - std::vector ins; - std::vector outs; - const auto& cuda_ctx = - ctx.template device_context(); - - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_CUDA_COMPARE_KERNEL(op_type, func) \ - REGISTER_OP_CUDA_KERNEL( \ - op_type, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - 
ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>, \ - ops::CompareOpKernel, void>); - -REGISTER_CUDA_COMPARE_KERNEL(equal, EqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(not_equal, NotEqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(less_than, LessThanFunctor) -REGISTER_CUDA_COMPARE_KERNEL(less_equal, LessEqualFunctor) -REGISTER_CUDA_COMPARE_KERNEL(greater_than, GreaterThanFunctor) -REGISTER_CUDA_COMPARE_KERNEL(greater_equal, GreaterEqualFunctor) -#undef REGISTER_CUDA_COMPARE_KERNEL diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h deleted file mode 100644 index be017a01ef3237fd8572e248d691daa97c999509..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define COMPARE_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEM_TYPE = InT; \ - HOSTDEVICE OutT operator()(const InT a, const InT b) const { \ - return static_cast(a op b); \ - } \ - }; - -COMPARE_FUNCTOR(LessThanFunctor, <) -COMPARE_FUNCTOR(LessEqualFunctor, <=) -COMPARE_FUNCTOR(GreaterThanFunctor, >) -COMPARE_FUNCTOR(GreaterEqualFunctor, >=) -#undef COMPARE_FUNCTOR - -template -struct EqualFunctor { - using ELEM_TYPE = InT; - HOSTDEVICE OutT operator()(const InT a, const InT b) const { - if (std::is_floating_point::value) { - // This branch will be optimized while compiling if T is integer. It is - // safe to cast a and b to double. - return static_cast(fabs(static_cast(a - b)) < 1e-8); - } else { - return static_cast(a == b); - } - } -}; - -template -struct NotEqualFunctor { - using ELEM_TYPE = InT; - HOSTDEVICE bool operator()(const InT a, const InT b) const { - return !EqualFunctor()(a, b); - } -}; - -template -class CompareOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - int axis = context.Attr("axis"); - - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx(context, x, y, axis, - Functor(), z); - } else { - ElementwiseComputeEx( - context, x, y, axis, InverseFunctor(), z); - } - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_COMPARE_KERNEL(op_type, dev, functor, inverse_functor) \ - REGISTER_OP_##dev##_KERNEL(op_type, \ - 
::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, \ - functor, inverse_functor>); diff --git a/paddle/fluid/operators/controlflow/compare_op_mlu.cc b/paddle/fluid/operators/controlflow/compare_op_mlu.cc index 9dc287ab76a67c6026ec8794793e77179063af3d..c39743ef9914c039f13428d43a66b1aa66ada0ed 100644 --- a/paddle/fluid/operators/controlflow/compare_op_mlu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_mlu.cc @@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc index 7bc4ca09771355361d8106421dc57601b94c88f1..7377d7cf8d312c4f4f405235b21b372b1a7a738c 100644 --- a/paddle/fluid/operators/controlflow/compare_op_npu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -11,7 +11,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/controlflow/compare_op_xpu.cc b/paddle/fluid/operators/controlflow/compare_op_xpu.cc index 698bd0516133861523f8d2b353abfeace4665840..2de8b4c9ba880e089bb4eaa4fa8df3bedb69b55b 100644 --- a/paddle/fluid/operators/controlflow/compare_op_xpu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_xpu.cc @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 3bbb284ca821b8576f2752446555f146c16bb189..4e6fda3d09a071f59c97c87315619d126497a756 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -53,12 +54,11 @@ static inline void GetNCDHW(const framework::DDim& dims, } template -static void RemovePaddingSlice(const framework::ExecutionContext& context, +static void RemovePaddingSlice(const phi::GPUContext& context, const Tensor* input, Tensor* out, const std::vector& starts, const std::vector& axes) { - auto& place = - *context.template device_context().eigen_device(); + auto& place = *context.eigen_device(); auto in_dims = input->dims(); auto new_out_dims = out->dims(); auto offsets = Eigen::DSizes(); @@ -171,11 +171,10 @@ void ChooseAlgo(const std::vector& perf_results, using framework::ConvSearchCache; -static void SetConvMathType(const framework::ExecutionContext& ctx, - cudnnDataType_t dtype, +static void SetConvMathType(const phi::GPUContext& ctx, cudnnDataType_t dtype, const platform::ConvolutionDescriptor& cdesc) { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - auto& dev_ctx = ctx.template device_context(); + auto& dev_ctx = ctx; if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( cdesc.desc(), CUDNN_TENSOR_OP_MATH)); @@ -231,8 +230,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { auto dtype = platform::CudnnDataType::type; bool has_got_workspace_size = true; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; @@ -284,8 +282,7 @@ struct SearchAlgorithm { } else if (deterministic) { algo = static_cast(1); } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = 
ctx; auto workspace_handle = dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = @@ -346,8 +343,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size = 0; @@ -413,8 +409,7 @@ struct SearchAlgorithm { } else if (deterministic) { return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = @@ -478,8 +473,7 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, - const framework::ExecutionContext& ctx) { + bool deterministic, const phi::GPUContext& ctx) { platform::CUDAGraphCaptureModeGuard guard; auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; @@ -534,8 +528,7 @@ struct SearchAlgorithm { } else if (deterministic) { return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; } else { - auto& dev_ctx = - ctx.template device_context(); + auto& dev_ctx = ctx; auto workspace_handle = dev_ctx.cudnn_workspace_handle(); AlgorithmsCache& algo_cache = *(framework::ConvSearchCache::Instance().GetBackwardFilter()); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu deleted file mode 100644 index dff60afd74c02f458b5b3c7428c2703197b61af0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ /dev/null @@ -1,1476 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the spopecific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/memory/memory.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/operators/conv_miopen_helper.h" -#else -#include "paddle/fluid/operators/conv_cudnn_helper.h" -#endif -#include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/operators/math/padding.h" -#include "paddle/fluid/platform/cudnn_workspace_helper.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/profiler/event_tracing.h" - -DECLARE_bool(cudnn_deterministic); -DECLARE_uint64(conv_workspace_size_limit); -DECLARE_bool(cudnn_exhaustive_search); - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; -using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; -using DataLayout = platform::DataLayout; - -static inline bool IsVoltaOrLater(const platform::CUDADeviceContext& dev_ctx) { - return dev_ctx.GetComputeCapability() >= 70; -} - -template -class CUDNNConvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), 
true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - const Tensor* input = ctx.Input("Input"); - auto* filter = ctx.Input("Filter"); - auto* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); - const std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = DataLayout::kNCHW; -#else - // Tensor Core introduced from Volta GPUs supports more faster conv op - // with FP16 in NHWC data format. - const bool compute_in_nhwc = - dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); - // We will only do data format conversion from NHWC to NCHW. - // cudnn will convert NCHW to NHWC automatically on Tensor Core. - auto compute_format = - compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW; -#endif - VLOG(3) << "Compute ConvOp with cuDNN:" - << " data_format=" << data_format << " compute_format=" - << (compute_format == DataLayout::kNHWC ? 
"NHWC" : "NCHW"); - - // ------------ transformed tensor ----------- - Tensor transformed_input_channel(input->type()); - Tensor transformed_output(output->type()); - Tensor transformed_filter_channel(filter->type()); - T* output_data = nullptr; - if (channel_last && compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst( - ctx, input, &transformed_input_channel); - TransToChannelFirst( - ctx, input, &transformed_input_channel); - - ResizeToChannelFirst(ctx, output, - &transformed_output); - - } else { - transformed_input_channel.ShareDataWith(*input); - transformed_output.ShareDataWith(*output); - } - if (compute_format == DataLayout::kNHWC) { - VLOG(3) << "Transform filter tensor from NCHW to NHWC."; - ResizeToChannelLast( - ctx, filter, &transformed_filter_channel); - TransToChannelLast( - ctx, filter, &transformed_filter_channel); - } else { - transformed_filter_channel.ShareDataWith(*filter); - } - output_data = transformed_output.data(); - - // update padding and dilation - auto in_dims = transformed_input_channel.dims(); - auto filter_dims = transformed_filter_channel.dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - - if (compute_format == DataLayout::kNCHW) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = - phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1); - } - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); - - Tensor transformed_input; - std::vector padding_common(data_dim, 0); - if (!is_sys_pad) { - std::vector padding_diff(data_dim); - std::vector 
new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[1] = transformed_input_channel.dims()[1]; - } else { - new_input_shape_vec[data_dim + 1] = - transformed_input_channel.dims()[data_dim + 1]; - } - - std::vector input_pad(transformed_input_channel.dims().size() * 2, - 0); - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[i + 2] = - transformed_input_channel.dims()[i + 2] + padding_diff[i]; - } else { - new_input_shape_vec[i + 1] = - transformed_input_channel.dims()[i + 1] + padding_diff[i]; - } - if (compute_format == DataLayout::kNCHW) { - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } else { - input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - const int rank = transformed_input_channel.dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_input.ShareDataWith(transformed_input_channel); - if (paddings.size() == data_dim) { - for (size_t i = 0; i < 
data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = transformed_input.data(); - const T* filter_data = transformed_filter_channel.data(); - - // ------------------- cudnn descriptors --------------------- - ConvArgs args{&transformed_input, - &transformed_filter_channel, - &transformed_output, - strides, - padding_common, - dilations, - dtype}; - - auto handle = dev_ctx.cudnn_handle(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC - : DataLayout::kNCHW; - if (transformed_input.dims().size() == 5) { - layout = compute_format == DataLayout::kNHWC ? DataLayout::kNDHWC - : DataLayout::kNCDHW; - } - auto layout_format = GetCudnnTensorFormat(layout); - - args.handle = handle; - -#ifdef PADDLE_WITH_HIP - // MIOPEN need to set groups in cdesc in miopen_desc.h - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), groups); -#else - args.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn()); -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // cudnn 7 can support groups, no need to do it manually - // FIXME(typhoonzero): find a better way to disable groups - // rather than setting it to 1. 
- PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionGroupCount( - args.cdesc.desc(), groups)); - groups = 1; -#endif -#ifdef PADDLE_WITH_HIP - // MIOPEN do not set groups in wdesc after set groups in cdesc - groups = 1; -#endif - args.idesc.set(transformed_input, layout_format); - args.wdesc.set(transformed_filter_channel, layout_format, groups); - args.odesc.set(transformed_output, layout_format); - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - - if (compute_format == DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output.dims(), DataLayout::kNHWC, &o_n, &o_c, &o_d, - &o_h, &o_w); - } else { - GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, - &o_h, &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; - // ------------------- cudnn conv workspace --------------------- - size_t workspace_size = 0; // final workspace to allocate. 
-// ------------------- cudnn conv algorithm --------------------- -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t algo{}; - using search = SearchAlgorithm; - workspace_size = search::GetWorkspaceSize(args); - algo = search::Find(args, exhaustive_search, deterministic, - workspace_size, ctx); -#else - cudnnConvolutionFwdAlgo_t algo{}; - using search = SearchAlgorithm; - algo = search::Find(args, exhaustive_search, deterministic, ctx); - workspace_size = search::GetWorkspaceSize(args, algo); -#endif - -#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) - // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\ - // FWD_ALGO_WINOGRAD_NONFUSED, but this kind of algorithm is unstable - // in forward computation, so change the algorithm to CUDNN_CONVOLUTION_\ - // FWD_ALGO_IMPLICIT_GEMM manually. - if (ctx.Attr("groups") > 1) { - algo = static_cast(0); - } -#endif - - // ------------------- cudnn conv forward --------------------- - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - -// NOTE(zhiqiu): inplace addto is not supportted in double grad yet. -// ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : 0.0f; -// VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); - -#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args.idesc.desc(), input_data, - args.wdesc.desc(), filter_data, args.cdesc.desc(), algo, - &beta, args.odesc.desc(), output_data, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args.idesc.desc(), - input_data + i * group_offset_in, args.wdesc.desc(), - filter_data + i * group_offset_filter, args.cdesc.desc(), - algo, workspace_ptr, workspace_size, &beta, - args.odesc.desc(), output_data + i * group_offset_out)); - }, - workspace_size); - } -#endif - - if (channel_last && compute_format == DataLayout::kNCHW) { - TransToChannelLast( - ctx, &transformed_output, output); - } - } -}; - -template -class CUDNNConvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto input = ctx.Input("Input"); - auto filter = ctx.Input("Filter"); - auto output_grad = ctx.Input(framework::GradVarName("Output")); - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto filter_grad = ctx.Output(framework::GradVarName("Filter")); - - if (input_grad) { - input_grad->mutable_data(ctx.GetPlace()); - } - if (filter_grad) { - filter_grad->mutable_data(ctx.GetPlace()); - } - - std::vector dilations = ctx.Attr>("dilations"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string padding_algorithm = 
ctx.Attr("padding_algorithm"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - const std::string data_format = ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - // HIP MIOPEN ONLY SUPPORT NCHW format - auto compute_format = DataLayout::kNCHW; -#else - const bool compute_in_nhwc = - dtype == CUDNN_DATA_HALF && IsVoltaOrLater(dev_ctx); - auto compute_format = - compute_in_nhwc && channel_last ? DataLayout::kNHWC : DataLayout::kNCHW; -#endif - VLOG(3) << "Compute ConvGradOp with cuDNN:" - << " data_format=" << data_format << " compute_format=" - << (compute_format == DataLayout::kNHWC ? 
"NHWC" : "NCHW"); - - // transform Tensor - Tensor transformed_input_channel(input->type()); - Tensor transformed_output_grad_channel(output_grad->type()); - Tensor transformed_input_grad_channel(input->type()); - Tensor transformed_filter_channel(filter->type()); - Tensor transformed_filter_grad_channel(filter->type()); - - if (channel_last && compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform input, output_grad, input_grad and tensor from " - "NHWC to NCHW."; - ResizeToChannelFirst( - ctx, input, &transformed_input_channel); - TransToChannelFirst( - ctx, input, &transformed_input_channel); - - ResizeToChannelFirst( - ctx, output_grad, &transformed_output_grad_channel); - TransToChannelFirst( - ctx, output_grad, &transformed_output_grad_channel); - - if (input_grad) { - ResizeToChannelFirst( - ctx, input_grad, &transformed_input_grad_channel); - // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy - // the data of input_grad to transformed_input_grad_channel. 
- if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { - TransToChannelFirst( - ctx, input_grad, &transformed_input_grad_channel); - } - } - } else { - transformed_input_channel.ShareDataWith(*input); - transformed_output_grad_channel.ShareDataWith(*output_grad); - if (input_grad) { - transformed_input_grad_channel.ShareDataWith(*input_grad); - } - } - - if (compute_format == DataLayout::kNHWC) { - VLOG(3) << "Transform filter and filter_grad tensor from NCHW to NHWC."; - ResizeToChannelLast( - ctx, filter, &transformed_filter_channel); - TransToChannelLast( - ctx, filter, &transformed_filter_channel); - - if (filter_grad) { - ResizeToChannelLast( - ctx, filter_grad, &transformed_filter_grad_channel); - } - } else { - transformed_filter_channel.ShareDataWith(*filter); - if (filter_grad) { - transformed_filter_grad_channel.ShareDataWith(*filter_grad); - } - } - - // update paddings - auto in_dims = transformed_input_channel.dims(); - auto filter_dims = transformed_filter_channel.dims(); - framework::DDim in_data_dims; - framework::DDim filter_data_dims; - if (compute_format == DataLayout::kNCHW) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - filter_data_dims = phi::slice_ddim(filter_dims, 2, filter_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - filter_data_dims = - phi::slice_ddim(filter_dims, 1, filter_dims.size() - 1); - } - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - // cuDNN only supports padding the same amount on every dimension. - // So we create a new padded input tensor. 
- int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_input(input->type()); - Tensor transformed_input_grad(input->type()); - std::vector padding_common(data_dim, 0); - std::vector input_pad(transformed_input_channel.dims().size() * 2, 0); - - if (!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_input_channel.dims()[0]; - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[1] = transformed_input_channel.dims()[1]; - } else { - new_input_shape_vec[data_dim + 1] = - transformed_input_channel.dims()[data_dim + 1]; - } - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - if (compute_format == DataLayout::kNCHW) { - new_input_shape_vec[i + 2] = - transformed_input_channel.dims()[i + 2] + padding_diff[i]; - } else { - new_input_shape_vec[i + 1] = - transformed_input_channel.dims()[i + 1] + padding_diff[i]; - } - if (compute_format == DataLayout::kNCHW) { - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } else { - input_pad[2 * i + 2] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 2 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_input.Resize(new_input_shape); - - transformed_input_grad.Resize(new_input_shape); - auto& dev_ctx = - ctx.template device_context(); - - transformed_input = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (input_grad) { - transformed_input_grad = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - // pad for input - const int rank = transformed_input_channel.dims().size(); - T pad_value(0.0); - switch (rank) { - case 
4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, - &transformed_input); - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - } else { - transformed_input.ShareDataWith(transformed_input_channel); - if (input_grad) { - transformed_input_grad.ShareDataWith(transformed_input_grad_channel); - } - if (paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* input_data = transformed_input.data(); - const T* output_grad_data = transformed_output_grad_channel.data(); - const T* filter_data = transformed_filter_channel.data(); - T* filter_grad_data = nullptr; - T* input_grad_data = nullptr; - T* transformed_input_grad_data = nullptr; - - ConvArgs args1{&transformed_input_grad, - &transformed_filter_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{&transformed_input, - &transformed_filter_grad_channel, - &transformed_output_grad_channel, - strides, - padding_common, - dilations, - dtype}; - - auto handle = dev_ctx.cudnn_handle(); - DataLayout layout = compute_format == DataLayout::kNHWC ? DataLayout::kNHWC - : DataLayout::kNCHW; - if (transformed_input.dims().size() == 5) { - layout = compute_format == DataLayout::kNHWC ? 
DataLayout::kNDHWC - : DataLayout::kNCDHW; - } - auto layout_tensor = GetCudnnTensorFormat(layout); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - - int i_n, i_c, i_d, i_h, i_w; - int o_n, o_c, o_d, o_h, o_w; - if (compute_format == DataLayout::kNHWC) { - GetNCDHW(transformed_input.dims(), DataLayout::kNHWC, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNHWC, &o_n, - &o_c, &o_d, &o_h, &o_w); - } else { - GetNCDHW(transformed_input.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, - &i_h, &i_w); - GetNCDHW(transformed_output_grad_channel.dims(), DataLayout::kNCHW, &o_n, - &o_c, &o_d, &o_h, &o_w); - } - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = transformed_filter_channel.numel() / groups; -// ------------------- cudnn backward algorithm --------------------- -#ifdef PADDLE_WITH_HIP - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - // input data workspace_size - size_t workspace_size_d = 0; - // weight workspace_size - size_t workspace_size_w = 0; - int iwo_groups = groups; - int c_groups = 1; - -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_groups = 1; - c_groups = groups; - groups = 1; -#endif - - if (input_grad) { - // ------------------- cudnn descriptors --------------------- - input_grad_data = input_grad->data(); - transformed_input_grad_data = transformed_input_grad.data(); - args1.handle = handle; - args1.idesc.set(transformed_input_grad, layout_tensor); - args1.wdesc.set(transformed_filter_channel, layout_tensor, iwo_groups); - args1.odesc.set(transformed_output_grad_channel, layout_tensor); - args1.cdesc.set(dtype, padding_common, strides, dilations, - 
platform::AllowTF32Cudnn(), c_groups); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size_d = - std::max(workspace_size_d, search1::GetWorkspaceSize(args1)); - data_algo = search1::Find(args1, exhaustive_search, deterministic, - workspace_size_d, ctx); -#else - using search1 = SearchAlgorithm; - data_algo = - search1::Find(args1, exhaustive_search, deterministic, ctx); - workspace_size_d = std::max(workspace_size_d, - search1::GetWorkspaceSize(args1, data_algo)); -#endif - } - - if (filter_grad) { - // ------------------- cudnn descriptors --------------------- - filter_grad_data = transformed_filter_grad_channel.data(); - args2.handle = handle; - args2.idesc.set(transformed_input, layout_tensor); - args2.wdesc.set(transformed_filter_grad_channel, layout_tensor, - iwo_groups); - args2.odesc.set(transformed_output_grad_channel, layout_tensor); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_groups); -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size_w = - std::max(workspace_size_w, search2::GetWorkspaceSize(args2)); - filter_algo = search2::Find(args2, exhaustive_search, deterministic, - workspace_size_w, ctx); -#else - using search2 = SearchAlgorithm; - filter_algo = - search2::Find(args2, exhaustive_search, deterministic, ctx); - workspace_size_w = std::max( - workspace_size_w, search2::GetWorkspaceSize(args2, filter_algo)); -#endif - } - - // ------------------- cudnn conv backward data --------------------- - ScalingParamType alpha = 1.0f; -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - ScalingParamType beta = 0.0f; -#else - ScalingParamType beta = - (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) ? 1.0f : 0.0f; -#endif - VLOG(4) << "Conv_grad: use_addto = " - << (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")); - - if (input_grad) { -// When beta is 0, it is unnecessary to reset input_grad. 
-// When beta is 1, the output cannot be reset since addt strategy used. -#ifdef PADDLE_WITH_HIP - if (ctx.HasAttr("use_addto") && ctx.Attr("use_addto")) { - Tensor temp_tensor(transformed_input_grad.type()); - temp_tensor.Resize(transformed_input_grad.dims()); - T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), temp_tensor_data, - cudnn_workspace_ptr, workspace_size_d)); - }, - workspace_size_d); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenOpTensor( - handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), - transformed_input_grad_data, &alpha, args1.idesc.desc(), - temp_tensor_data, &beta, args1.idesc.desc(), - transformed_input_grad_data)); - } else { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), - transformed_input_grad_data, cudnn_workspace_ptr, - workspace_size_d)); - }, - workspace_size_d); - } - -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args1.wdesc.desc(), - filter_data + i * group_offset_filter, args1.odesc.desc(), - output_grad_data + i * group_offset_out, - args1.cdesc.desc(), data_algo, cudnn_workspace_ptr, - workspace_size_d, &beta, args1.idesc.desc(), - transformed_input_grad_data + i * group_offset_in)); - }, - workspace_size_d); - } -#endif - if (!is_sys_pad) { - std::vector starts(transformed_input_channel.dims().size(), 0); - 
std::vector axes(transformed_input_channel.dims().size(), 0); - - for (size_t i = 0; i < transformed_input_channel.dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - - transformed_input_grad_channel.mutable_data(ctx.GetPlace()); - if (transformed_input_channel.dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_input_grad, &transformed_input_grad_channel, - starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_input_grad, &transformed_input_grad_channel, - starts, axes); - } - } - - if (channel_last && compute_format == DataLayout::kNCHW) { - TransToChannelLast( - ctx, &transformed_input_grad_channel, input_grad); - } - } - - // filter_grad do not use inplace addto. - ScalingParamType beta_filter = 0.0f; - // ------------------- cudnn conv backward filter --------------------- - if (filter_grad) { -// Because beta is zero, it is unnecessary to reset filter_grad. -#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args2.odesc.desc(), output_grad_data, - args2.idesc.desc(), input_data, args2.cdesc.desc(), - filter_algo, &beta, args2.wdesc.desc(), filter_grad_data, - cudnn_workspace_ptr, workspace_size_w)); - }, - workspace_size_w); -#else - for (int i = 0; i < groups; i++) { - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args2.idesc.desc(), - input_data + i * group_offset_in, args2.odesc.desc(), - output_grad_data + i * group_offset_out, - args2.cdesc.desc(), filter_algo, cudnn_workspace_ptr, - workspace_size_w, &beta_filter, args2.wdesc.desc(), - filter_grad_data + i * group_offset_filter)); - }, - workspace_size_w); - } -#endif - - if (compute_format == DataLayout::kNHWC) { - TransToChannelFirst( - ctx, &transformed_filter_grad_channel, filter_grad); - } - } - } -}; 
- -/* - * Inputs: I, W, dO, ddI, ddW - * Outputs: ddO, dW, dI - * ddo = conv(ddI, W) + conv(I, ddW) - * dW = conv_bp_filter(ddI, dO) - * dI = conv_bp_data(ddW, dO) - */ -template -class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CUDAPlace.")); - auto X = ctx.Input("Input"); - auto W = ctx.Input("Filter"); - auto dO = ctx.Input("DOutput"); - auto ddX = ctx.Input("DDInput"); - auto ddW = ctx.Input("DDFilter"); - - auto ddO = ctx.Output("DDOutput"); - auto dW = ctx.Output("DFilter"); - auto dX = ctx.Output("DInput"); - if (ddO) { - ddO->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, ddO, static_cast(0)); - } - if (dW) { - dW->mutable_data(ctx.GetPlace()); - } - if (dX) { - dX->mutable_data(ctx.GetPlace()); - } - - // const T* x = X->data(); - const T* dy = dO->data(); - const T* w = W->data(); - - const T* ddx = nullptr; - const T* ddw = nullptr; - T *dw, *dx, *ddy; - dw = dx = ddy = nullptr; - T* transformed_dx = nullptr; - const std::vector& strides = ctx.Attr>("strides"); - std::vector dilations = ctx.Attr>("dilations"); - int groups = ctx.Attr("groups"); - - bool exhaustive_search = - FLAGS_cudnn_exhaustive_search || (ctx.HasAttr("exhaustive_search") && - ctx.Attr("exhaustive_search")); - bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, false, - platform::errors::InvalidArgument( - "Cann't set exhaustive_search True and " - "FLAGS_cudnn_deterministic True at same time.")); - - std::vector paddings = ctx.Attr>("paddings"); - - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const std::string data_format = 
ctx.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensors to channel first----------- - Tensor transformed_X_channel(X->type()); - Tensor transformed_dO_channel(dO->type()); - Tensor transformed_ddX_channel(X->type()); - - Tensor transformed_ddO_channel(dO->type()); - Tensor transformed_dX_channel(X->type()); - - if (channel_last) { - ResizeToChannelFirst( - ctx, X, &transformed_X_channel); - TransToChannelFirst( - ctx, X, &transformed_X_channel); - - ResizeToChannelFirst( - ctx, dO, &transformed_dO_channel); - TransToChannelFirst( - ctx, dO, &transformed_dO_channel); - - if (ddX) { - ResizeToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - TransToChannelFirst( - ctx, ddX, &transformed_ddX_channel); - } - - if (ddO) { - ResizeToChannelFirst( - ctx, ddO, &transformed_ddO_channel); - } - if (dX) { - ResizeToChannelFirst( - ctx, dX, &transformed_dX_channel); - transformed_dX_channel.mutable_data(ctx.GetPlace()); - } - - } else { - transformed_X_channel = *X; - transformed_dO_channel = *dO; - if (ddX) { - transformed_ddX_channel = *ddX; - } - if (ddO) { - transformed_ddO_channel.ShareDataWith(*ddO); - } - if (dX) { - transformed_dX_channel.ShareDataWith(*dX); - } - } - - auto in_dims = transformed_X_channel.dims(); - auto filter_dims = W->dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); - Tensor transformed_X(X->type()); - Tensor transformed_ddX(X->type()); - - Tensor transformed_dX(X->type()); - - std::vector padding_common(data_dim, 0); - std::vector input_pad(X->dims().size() * 2, 0); - - if 
(!is_sys_pad) { - // get pad - std::vector padding_diff(data_dim); - std::vector new_input_shape_vec(data_dim + 2); - new_input_shape_vec[0] = transformed_X_channel.dims()[0]; - new_input_shape_vec[1] = transformed_X_channel.dims()[1]; - - for (size_t i = 0; i < data_dim; ++i) { - padding_diff[i] = std::abs(paddings[2 * i] - paddings[2 * i + 1]); - padding_common[i] = std::min(paddings[2 * i], paddings[2 * i + 1]); - new_input_shape_vec[i + 2] = - transformed_X_channel.dims()[i + 2] + padding_diff[i]; - input_pad[2 * i + 4] = paddings[2 * i] - padding_common[i]; - input_pad[2 * i + 4 + 1] = paddings[2 * i + 1] - padding_common[i]; - } - framework::DDim new_input_shape(phi::make_ddim(new_input_shape_vec)); - transformed_X.Resize(new_input_shape); - transformed_ddX.Resize(new_input_shape); - transformed_dX.Resize(new_input_shape); - - transformed_X = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - if (ddX) { - transformed_ddX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - if (dX) { - transformed_dX = - ctx.AllocateTmpTensor( - new_input_shape, dev_ctx); - } - - // pad for input - const int rank = X->dims().size(); - T pad_value(0.0); - switch (rank) { - case 4: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); - if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - case 5: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); - if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, - &transformed_ddX); - } - } break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "ConvOp only support tensors with 4 or 5 dimensions.")); - } - - } else { - transformed_X.ShareDataWith(transformed_X_channel); - if (ddX) { - transformed_ddX.ShareDataWith(transformed_ddX_channel); - } - if (dX) { - transformed_dX.ShareDataWith(transformed_dX_channel); - } - - if 
(paddings.size() == data_dim) { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[i]; - } - } else { - for (size_t i = 0; i < data_dim; ++i) { - padding_common[i] = paddings[2 * i]; - } - } - } - - const T* x = transformed_X.data(); - - int iwo_group = groups; - int c_group = 1; -#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) - iwo_group = 1; - c_group = groups; - groups = 1; -#endif - auto dtype = platform::CudnnDataType::type; - - auto handle = dev_ctx.cudnn_handle(); - - ConvArgs args1{&transformed_ddX, - W, - &transformed_ddO_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args2{ - &transformed_X, ddW, &transformed_ddO_channel, strides, padding_common, - dilations, dtype}; - ConvArgs args3{&transformed_ddX, - dW, - &transformed_dO_channel, - strides, - padding_common, - dilations, - dtype}; - ConvArgs args4{ - &transformed_dX, ddW, &transformed_dO_channel, strides, padding_common, - dilations, dtype}; - -#ifdef PADDLE_WITH_HIP - miopenConvFwdAlgorithm_t fwd_algo1 = - static_cast(0); - miopenConvFwdAlgorithm_t fwd_algo2 = - static_cast(0); - miopenConvBwdDataAlgorithm_t data_algo = - static_cast(0); - miopenConvBwdWeightsAlgorithm_t filter_algo = - static_cast(0); -#else - cudnnConvolutionFwdAlgo_t fwd_algo1 = - static_cast(0); - cudnnConvolutionFwdAlgo_t fwd_algo2 = - static_cast(0); - cudnnConvolutionBwdDataAlgo_t data_algo = - static_cast(0); - cudnnConvolutionBwdFilterAlgo_t filter_algo = - static_cast(0); -#endif - - auto layout = GetCudnnTensorFormat(DataLayout::kNCHW); - - // ddo = conv(ddI, W) + conv(I, ddW) - size_t workspace_size = 0; - - T* transformed_ddy_channel = nullptr; - if (ddO) { - ddy = ddO->data(); - transformed_ddy_channel = transformed_ddO_channel.data(); - if (ddX) { - args1.handle = handle; - args1.idesc.set(transformed_ddX, iwo_group); - args1.wdesc.set(*W, layout, iwo_group); - args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, padding_common, 
strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search1 = SearchAlgorithm; - workspace_size = search1::GetWorkspaceSize(args1); - fwd_algo1 = search1::Find(args1, exhaustive_search, false, - workspace_size, ctx); -#else - using search1 = SearchAlgorithm; - fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); - workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); -#endif - } - - if (ddW) { - ddw = ddW->data(); - args2.handle = handle; - args2.idesc.set(transformed_X, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); - args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search2 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search2::GetWorkspaceSize(args2)); - fwd_algo2 = search2::Find(args2, exhaustive_search, false, - workspace_size, ctx); -#else - using search2 = SearchAlgorithm; - fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); - workspace_size = std::max(workspace_size, - search2::GetWorkspaceSize(args2, fwd_algo2)); -#endif - } - } - - if (dW && ddX) { - dw = dW->data(); - args3.handle = handle; - args3.idesc.set(transformed_ddX, iwo_group); - args3.wdesc.set(*dW, layout, iwo_group); - args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search3 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = search3::Find(args3, exhaustive_search, deterministic, - workspace_size, ctx); -#else - using search3 = SearchAlgorithm; - filter_algo = - search3::Find(args3, exhaustive_search, deterministic, ctx); - workspace_size = std::max(workspace_size, - search3::GetWorkspaceSize(args3, filter_algo)); -#endif - } - - if (ddW && dX) { - 
transformed_dx = transformed_dX.data(); - - args4.handle = handle; - args4.idesc.set(transformed_dX, iwo_group); - args4.wdesc.set(*ddW, layout, iwo_group); - args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, - platform::AllowTF32Cudnn(), c_group); - -#ifdef PADDLE_WITH_HIP - using search4 = SearchAlgorithm; - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4)); - data_algo = search4::Find(args4, exhaustive_search, deterministic, - workspace_size, ctx); -#else - using search4 = SearchAlgorithm; - data_algo = - search4::Find(args4, exhaustive_search, deterministic, ctx); - workspace_size = - std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); -#endif - } - - int i_n, i_c, i_d, i_h, i_w; - GetNCDHW(transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, - &i_w); - - int o_n, o_c, o_d, o_h, o_w; - GetNCDHW(transformed_dO_channel.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, - &o_h, &o_w); - - int group_offset_in = i_c / groups * i_h * i_w * i_d; - int group_offset_out = o_c / groups * o_h * o_w * o_d; - int group_offset_filter = W->numel() / groups; - - ScalingParamType alpha = 1.0f; - ScalingParamType beta = 0.0f; - - // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. - // ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : - // 0.0f; - // VLOG(4) << "Conv_grad_grad: use_addto = " << ctx.Attr("use_addto"); - auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); - - if (ddO) { - if (ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args1.idesc.desc(), ddx, - args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1, - &beta, args1.odesc.desc(), transformed_ddy_channel, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args1.idesc.desc(), - ddx + i * group_offset_in, args1.wdesc.desc(), - w + i * group_offset_filter, args1.cdesc.desc(), - fwd_algo1, workspace_ptr, workspace_size, &beta, - args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (ddW) { -#ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(), - ddw, args2.cdesc.desc(), fwd_algo2, &beta, - args2.odesc.desc(), transformed_ddy_channel, - workspace_ptr, workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionForward( - handle, &alpha, args2.idesc.desc(), - x + i * group_offset_in, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.cdesc.desc(), - fwd_algo2, workspace_ptr, workspace_size, &alpha, - args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out)); - }, - workspace_size); - } -#endif - } - if (channel_last) { - TransToChannelLast( 
- ctx, &transformed_ddO_channel, ddO); - } - } - T* transformed_dy_channel = transformed_dO_channel.data(); - if (dW && ddX) { - ddx = transformed_ddX.data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args3.odesc.desc(), transformed_dy_channel, - args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo, - &beta, args3.wdesc.desc(), dw, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardFilter( - handle, &alpha, args3.idesc.desc(), - ddx + i * group_offset_in, args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.cdesc.desc(), filter_algo, workspace_ptr, - workspace_size, &beta, args3.wdesc.desc(), - dw + i * group_offset_filter)); - }, - workspace_size); - } -#endif - } - - if (dX && ddW) { - ddw = ddW->data(); -#ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args4.odesc.desc(), transformed_dy_channel, - args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo, - &beta, args4.idesc.desc(), transformed_dx, workspace_ptr, - workspace_size)); - }, - workspace_size); -#else - for (int i = 0; i < groups; i++) { - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnConvolutionBackwardData( - handle, &alpha, args4.wdesc.desc(), - ddw + i * group_offset_filter, args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.cdesc.desc(), data_algo, workspace_ptr, - workspace_size, &beta, args4.idesc.desc(), - transformed_dx + i * group_offset_in)); - }, - workspace_size); - } -#endif - - if (!is_sys_pad) { - // reverse padded input - 
std::vector starts(X->dims().size(), 0); - std::vector axes(X->dims().size(), 0); - - for (size_t i = 0; i < X->dims().size(); ++i) { - starts[i] = input_pad[2 * i]; - axes[i] = i; - } - if (X->dims().size() == 4) { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } else { - RemovePaddingSlice( - ctx, &transformed_dX, &transformed_dX_channel, starts, axes); - } - } - if (channel_last) { - TransToChannelLast( - ctx, &transformed_dX_channel, dX); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue -// Use depthwise_conv2d in MIOPEN to resolve this issue -REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv3d_grad_grad, CUDNN, plat::CUDAPlace, - 
paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#else -#if CUDNN_VERSION_MIN(8, 1, 0) -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#else -REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv2d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad_grad, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#endif - -REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, - 
paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel, - paddle::operators::CUDNNConvOpKernel); -REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); -REGISTER_OP_KERNEL( - conv3d_grad_grad, CUDNN, plat::CUDAPlace, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel, - paddle::operators::CUDNNConvDoubleGradOpKernel); -#endif diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 9c9795143eb78dc5c1b22ec792d8753f915c976e..66f718693847837a4d169a5cab9629a1f668244f 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -51,12 +52,11 @@ static inline void GetNCDHW(const framework::DDim& dims, } template -static void RemovePaddingSlice(const framework::ExecutionContext& context, +static void RemovePaddingSlice(const phi::GPUContext& context, const Tensor* input, Tensor* out, const std::vector& starts, const std::vector& axes) { - auto& place = - *context.template device_context().eigen_device(); + auto& place = *context.eigen_device(); auto in_dims = input->dims(); auto new_out_dims = out->dims(); auto offsets = Eigen::array(); @@ -128,11 +128,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto 
workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; @@ -170,11 +169,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; @@ -212,11 +210,10 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, size_t workspace_size, - const framework::ExecutionContext& ctx) { + const phi::GPUContext& ctx) { algo_t algo; - auto& dev_ctx = ctx.template device_context(); - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); int find_count; miopenConvAlgoPerf_t find_result; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index e345a4d2603b630508e299207984f4708217a1d8..8213e877f722433488cd826bb63cba376972c57a 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -205,14 +205,14 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( paddle::framework::DataTypeToString(input_data_type), paddle::framework::DataTypeToString(filter_data_type))); } -#ifndef PADDLE_WITH_ASCEND_CL - if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ( - library, framework::LibraryType::kCUDNN, - platform::errors::InvalidArgument( - "float16 can only be used when CUDNN or NPU is used")); - } -#endif +// #ifndef PADDLE_WITH_ASCEND_CL +// if (input_data_type == framework::proto::VarType::FP16) { +// PADDLE_ENFORCE_EQ( +// library, framework::LibraryType::kCUDNN, +// platform::errors::InvalidArgument( +// "float16 can only be used when CUDNN or NPU is 
used")); +// } +// #endif #if PADDLE_WITH_CUDA if (input_data_type == framework::proto::VarType::BF16 && library == framework::LibraryType::kCUDNN) { @@ -869,42 +869,6 @@ REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad, ops::Conv3DDoubleGradMaker); REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad); -// depthwise conv kernel -// TODO(xingzhaolong): neon kernel for mobile -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); - -REGISTER_OP_CPU_KERNEL( - depthwise_conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); - -REGISTER_OP_CPU_KERNEL( - conv2d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL( - conv2d_grad_grad, - ops::GemmConvDoubleGradKernel, - ops::GemmConvDoubleGradKernel); - -REGISTER_OP_CPU_KERNEL( - conv3d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL( - conv3d_grad_grad, - ops::GemmConvDoubleGradKernel, - ops::GemmConvDoubleGradKernel); - REGISTER_OP_VERSION(conv2d) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/conv_op.cu.cc b/paddle/fluid/operators/conv_op.cu.cc deleted file mode 100644 index d07593f5c02e9129c1f333667baccb0531bc31f9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/conv_op.cu.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/conv_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d, - ops::DepthwiseConvKernel, - ops::DepthwiseConvKernel); - -REGISTER_OP_CUDA_KERNEL( - depthwise_conv2d_grad, - ops::DepthwiseConvGradKernel, - ops::DepthwiseConvGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv2d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CUDA_KERNEL( - conv2d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); - -REGISTER_OP_CUDA_KERNEL( - conv3d, ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_CUDA_KERNEL( - conv3d_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 26166362da8a2984dc3c0670b186b85800767fb7..a5d888765bf37d45d501a3dbe5437f7c2ab5fc51 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -214,817 +213,5 @@ class ConvOpDoubleGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override; }; -template -class GemmConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - // The filter will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. 
- Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - const int groups = context.Attr("groups"); - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - Tensor transformed_input(input->dtype()); - Tensor transformed_output(output->dtype()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output, - &transformed_output); - - } else { - transformed_input = *input; - transformed_output = *output; - } - - // update padding and dilation - auto trans_in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims = - phi::slice_ddim(trans_in_dims, 2, trans_in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - auto& dev_ctx = context.template device_context(); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - // filter_shape_vec: - // {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - - // output_shape_vec: - // {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - phi::vectorize(transformed_output.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: - // {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, - // o_d,o_h, o_w} - size_t data_dim = 
filter_shape_vec.size() - 2; - - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = trans_in_dims[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: - // (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * o_h * - // o_w) - - framework::DDim col_matrix_shape = phi::flatten_to_2d(col_shape, data_dim); - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - Tensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. - Tensor col_matrix; - if (is_expand) { - col = context.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - framework::DDim in_matrix_shape = phi::slice_ddim( - transformed_input.dims(), 1, transformed_input.dims().size()); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - framework::DDim output_matrix_shape = { - transformed_output.dims()[1], - transformed_output.numel() / - (transformed_output.dims()[0] * transformed_output.dims()[1])}; - - // convolution operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output.dims()[1]) / groups; - - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; - - auto blas = phi::funcs::GetBlas(dev_ctx); - for (int i = 0; i < batch_size; i++) { - Tensor in_batch = - transformed_input.Slice(i, i + 1).Resize(in_matrix_shape); - Tensor out_batch = - transformed_output.Slice(i, i + 1).Resize(output_matrix_shape); - - for (int g = 0; g < groups; g++) { - Tensor 
in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, in_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice, - T(0.0)); - } - } - if (channel_last) { - TransToChannelLast(context, &transformed_output, - output); - } - } -}; - -template -class GemmConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - // The filter and filter_grad will be reshaped in the calculations, - // so here use an assignment operation, - // that avoids modifying the variable in the Scope. 
- Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - int groups = context.Attr("groups"); - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - Tensor transformed_input(input->dtype()); - Tensor transformed_output_grad(output_grad->dtype()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output_grad, - &transformed_output_grad); - TransToChannelFirst(context, output_grad, - &transformed_output_grad); - } else { - transformed_input = *input; - transformed_output_grad = *output_grad; - } - - // update padding and dilation - auto in_dims = transformed_input.dims(); - auto filter_dims = filter.dims(); - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_input.dims()[0]); - - auto& dev_ctx = context.template device_context(); - - // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} - std::vector filter_shape_vec(phi::vectorize(filter.dims())); - // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} - std::vector output_shape_vec( - phi::vectorize(transformed_output_grad.dims())); - - // use col_shape in the im2col calculation - // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, - // o_h, o_w} - 
size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - col_shape_vec[0] = transformed_input.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - - // use col_matrix_shape in the gemm calculation - // size: (i_c/g * k_h * k_w, o_h * o_w) - // or - // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) - framework::DDim col_matrix_shape = - phi::flatten_to_2d(col_shape, data_dim + 1); - - framework::DDim input_shape = phi::slice_ddim( - transformed_input.dims(), 1, transformed_input.dims().size()); - - framework::DDim filter_matrix_shape = {filter.dims()[0], - filter.numel() / filter.dims()[0]}; - filter.Resize(filter_matrix_shape); - - framework::DDim output_matrix_shape = { - transformed_output_grad.dims()[1], - transformed_output_grad.numel() / (transformed_output_grad.dims()[0] * - transformed_output_grad.dims()[1])}; - - // convolution backward input operator: gemm + col2im(or col2vol) - // convolution backward weight operator: im2col(or vol2col) + gemm - int in_step = static_cast(transformed_input.dims()[1]) / groups; - int out_step = static_cast(transformed_output_grad.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - - Tensor col; - // col_matrix shares the same piece of data with col, - // but will be reshaped into a two-dimensional matrix shape - // to call the matrix multiplication interface. 
- Tensor col_matrix; - if (is_expand) { - col = context.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - Tensor transformed_input_grad(input_grad->dtype()); - if (channel_last) { - ResizeToChannelFirst(context, input_grad, - &transformed_input_grad); - - } else { - transformed_input_grad = *input_grad; - } - // if is_expand is false, the operation of set_zero is unnecessary, - // because math::matmul will reset input_grad. - if (is_expand) { - set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); - } - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; - - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_grad_batch = - transformed_input_grad.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - - Tensor in_grad_slice = - in_grad_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col_matrix.ShareDataWith(in_grad_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0), - &col_matrix, T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &in_grad_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); - } - } - } - if (channel_last) { - TransToChannelLast(context, &transformed_input_grad, - input_grad); - } - } - - if (filter_grad) { - filter_grad->mutable_data(context.GetPlace()); - Tensor filter_grad_ = 
*filter_grad; - filter_grad_.Resize(filter_matrix_shape); - set_zero(dev_ctx, filter_grad, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; i++) { - Tensor out_grad_batch = - transformed_output_grad.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor in_batch = transformed_input.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // im2col - Tensor out_grad_slice = - out_grad_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); - - if (!is_expand) { - col.ShareDataWith(in_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, in_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - - } else if (data_dim == 3U) { - vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor filter_grad_slice = - filter_grad_.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0), - &filter_grad_slice, T(1.0)); - } - } - } - } -}; - -template -class GemmConvDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet("It must use CPUPlace.")); - const Tensor* X = ctx.Input("Input"); - const Tensor* dY = ctx.Input("DOutput"); - const Tensor* ddX = ctx.Input("DDInput"); - const Tensor* ddW_in = ctx.Input("DDFilter"); - - Tensor* ddY = ctx.Output("DDOutput"); - Tensor* dW = ctx.Output("DFilter"); - Tensor* dX = ctx.Output("DInput"); - Tensor W = GET_DATA_SAFELY(ctx.Input("Filter"), "Input", "Filter", - "GemmConvDoubleGrad"); - if (!ddY && !dW && !dX) return; - - const int groups = ctx.Attr("groups"); - const 
std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::vector dilations = ctx.Attr>("dilations"); - const std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - const std::string data_format = ctx.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - Tensor transformed_X(X->dtype()); - Tensor transformed_dY(dY->dtype()); - Tensor transformed_ddX(X->dtype()); - - if (channel_last) { - ResizeToChannelFirst(ctx, X, &transformed_X); - TransToChannelFirst(ctx, X, &transformed_X); - - ResizeToChannelFirst(ctx, dY, &transformed_dY); - TransToChannelFirst(ctx, dY, &transformed_dY); - - if (ddX) { - ResizeToChannelFirst(ctx, ddX, &transformed_ddX); - TransToChannelFirst(ctx, ddX, &transformed_ddX); - } - } else { - transformed_X = *X; - transformed_dY = *dY; - if (ddX) { - transformed_ddX = *ddX; - } - } - - // update padding and dilation - auto in_dims = transformed_X.dims(); - auto filter_dims = W.dims(); - - framework::DDim in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - const int batch_size = static_cast(transformed_X.dims()[0]); - std::vector filter_shape_vec(phi::vectorize(W.dims())); - std::vector output_shape_vec( - phi::vectorize(transformed_dY.dims())); - - size_t data_dim = filter_shape_vec.size() - 2; - std::vector col_shape_vec(1 + 2 * data_dim); - // col_shape [in_channel/group, kh, kw, oh, ow] - col_shape_vec[0] = transformed_X.dims()[1] / groups; - for (size_t j = 0; j < data_dim; ++j) { - col_shape_vec[j + 1] = filter_shape_vec[j + 2]; - col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; - } - framework::DDim col_shape(phi::make_ddim(col_shape_vec)); - // 
col_matrix_shape [in_channel/group * kh * kw, oh * ow] - framework::DDim col_matrix_shape = - phi::flatten_to_2d(col_shape, data_dim + 1); - // input_shape [Cin, H, W] - framework::DDim input_shape = - phi::slice_ddim(transformed_X.dims(), 1, transformed_X.dims().size()); - // filter_matrix_shape [Cout, Cin * kh * kw] - framework::DDim filter_matrix_shape = {W.dims()[0], - W.numel() / W.dims()[0]}; - - W.Resize(filter_matrix_shape); - framework::DDim output_matrix_shape = { - transformed_dY.dims()[1], - transformed_dY.numel() / - (transformed_dY.dims()[0] * transformed_dY.dims()[1])}; - int in_step = static_cast(transformed_X.dims()[1]) / groups; - int out_step = static_cast(transformed_dY.dims()[1]) / groups; - - bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); - Tensor col; - Tensor col_matrix; - if (is_expand) { - col = ctx.AllocateTmpTensor(col_shape, dev_ctx); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } - - phi::funcs::SetConstant set_zero; - auto blas = phi::funcs::GetBlas(dev_ctx); - - // dx convolution double grad: gemm + col2im(col2vol) - // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, - // oH, oW) - if (dX && ddW_in) { - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - dX->mutable_data(ctx.GetPlace()); - - Tensor transformed_dX(dX->dtype()); - - if (channel_last) { - ResizeToChannelFirst(ctx, dX, &transformed_dX); - - } else { - transformed_dX = *dX; - } - // if is_expand is false, the operation of set_zero is unnecessary - // because math::matmul will reset dx - if (is_expand) { - set_zero(dev_ctx, &transformed_dX, static_cast(0)); - } - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; - - for (int i = 0; i < batch_size; i++) { - Tensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor dx_batch = transformed_dX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; g++) { - // gemm - Tensor 
dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - Tensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col_matrix.ShareDataWith(dx_slice); - col_matrix.Resize(col_matrix_shape); - } - blas.MatMul(ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, - T(0.0)); - - if (is_expand && data_dim == 2U) { - col2im(dev_ctx, col, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &dx_slice); - } else if (is_expand && data_dim == 3U) { - col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); - } - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_dX, dX); - } - } - - // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, - // oH, oW) - // dw convolution double grad: im2col(vol2col) + gemm - if (dW && ddX) { - dW->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, dW, static_cast(0)); - Tensor dW_arr = *dW; - dW_arr.Resize(filter_matrix_shape); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - Tensor dy_batch = - transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape); - Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape); - for (int g = 0; g < groups; ++g) { - // im2col - Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); - Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, ddx_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - - Tensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(dy_slice, false, col_matrix, true, 
T(1.0), &dw_slice, - T(1.0)); - } - } - } - - // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), - // w/ddw(Cout, Cin, kh, kw) - // ddy convolution double grad: im2col(vol2col) + gemm - if (ddY) { - ddY->mutable_data(ctx.GetPlace()); - - Tensor transformed_ddY(ddY->dtype()); - if (channel_last) { - ResizeToChannelFirst(ctx, ddY, &transformed_ddY); - } else { - transformed_ddY = *ddY; - } - - set_zero(dev_ctx, &transformed_ddY, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; - for (int i = 0; i < batch_size; ++i) { - Tensor ddy_batch = - transformed_ddY.Slice(i, i + 1).Resize(output_matrix_shape); - for (int g = 0; g < groups; ++g) { - // gemm - Tensor ddy_slice = ddy_batch.Slice(g * out_step, (g + 1) * out_step); - - if (ddX) { - Tensor ddx_batch = - transformed_ddX.Slice(i, i + 1).Resize(input_shape); - Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); - if (!is_expand) { - col.ShareDataWith(ddx_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, ddx_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 3U) { - vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); - } - Tensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(w_slice, false, col_matrix, false, T(1.0), &ddy_slice, - T(0.0)); - } - - if (ddW_in) { - Tensor x_batch = transformed_X.Slice(i, i + 1).Resize(input_shape); - Tensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); - - Tensor ddW; - ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); - if (!is_expand) { - col.ShareDataWith(x_slice); - col_matrix.ShareDataWith(col); - col_matrix.Resize(col_matrix_shape); - } else if (data_dim == 2U) { - im2col(dev_ctx, x_slice, dilations, strides, - std::vector{paddings[0], paddings[2], paddings[1], - paddings[3]}, - &col); - } else if (data_dim == 
3U) { - vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); - } - - // gemm - Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); - blas.MatMul(ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, - T(1.0)); - } - } - } - if (channel_last) { - TransToChannelLast(ctx, &transformed_ddY, ddY); - } - } - } -}; - -template -class DepthwiseConvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - const std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); - - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - if (channel_last) { - PADDLE_ENFORCE_EQ( - output->dims()[output->dims().size() - 1] % - input->dims()[input->dims().size() - 1], - 0, platform::errors::InvalidArgument( - "ShapeError: The output channels must be a multiple of the " - "input channels. But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[output->dims().size() - 1], - input->dims()[input->dims().size() - 1])); - } else { - PADDLE_ENFORCE_EQ( - output->dims()[1] % input->dims()[1], 0, - platform::errors::InvalidArgument( - "ShapeError: The output channels must be a multiple of the " - "input channels. 
But receivced output channel number is %d " - "and input channel number is %d", - output->dims()[1], input->dims()[1])); - } - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_format); - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - bool is_sys_pad = strides.size() * 2 == paddings.size() ? false : true; - if (!is_sys_pad) { - for (size_t i = 0; i < strides.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - auto& dev_ctx = context.template device_context(); - - if (fuse_relu) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output, data_layout); - } else { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, - output, data_layout); - } - } -}; - -template -class DepthwiseConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - std::vector strides = context.Attr>("strides"); - std::vector paddings = 
context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); - const std::string padding_algorithm = - context.Attr("padding_algorithm"); - const std::string data_format = context.Attr("data_format"); - - // update padding and dilation - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_format); - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - bool is_sys_pad = strides.size() * 2 == paddings.size() ? 
false : true; - if (!is_sys_pad) { - for (size_t i = 0; i < strides.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - - if (input_grad) { - input_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, input_grad, static_cast(0)); - - if (fuse_relu) { - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad, data_layout); - } else { - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, dilations, input_grad, data_layout); - } - } - - if (filter_grad) { - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - if (fuse_relu) { - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, - paddings, dilations, filter_grad, data_layout); - } else { - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, - paddings, dilations, filter_grad, data_layout); - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 4b8f9d7e6ca8d2f1dae99f1d034c53daf948f922..1841b78af32dd95d6884d5eb78ad30322ba7723e 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_helper.h" #endif #include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/padding.h" namespace paddle { namespace operators { @@ -108,7 +108,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); std::vector input_pad(input_transpose.dims().size() * 2, 0); Tensor transformed_input; @@ -139,12 +139,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, input_transpose, pad_value, &transformed_input); + phi::funcs::PadFunction( + dev_ctx, input_pad, input_transpose, pad_value, + &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, input_transpose, pad_value, &transformed_input); + phi::funcs::PadFunction( + dev_ctx, input_pad, input_transpose, pad_value, + &transformed_input); } break; default: PADDLE_THROW(platform::errors::InvalidArgument( @@ -242,10 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search = SearchAlgorithm; workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); - algo = search::Find(args, false, deterministic, workspace_size, ctx); + algo = search::Find( + args, false, deterministic, workspace_size, + ctx.template device_context()); #else using search = SearchAlgorithm; - algo = search::Find(args, false, deterministic, ctx); + algo = search::Find( + args, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, algo)); #endif @@ -375,7 +381,7 @@ class CUDNNConvTransposeGradOpKernel 
: public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); std::vector input_pad(input_transpose.dims().size() * 2, 0); Tensor transformed_output_grad; @@ -407,13 +413,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, output_grad_transpose, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, output_grad_transpose, pad_value, &transformed_output_grad); } break; case 5: { - math::PadFunction( - ctx, input_pad, output_grad_transpose, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, output_grad_transpose, pad_value, &transformed_output_grad); } break; default: @@ -499,11 +505,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { using search1 = SearchAlgorithm; workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); - data_algo = - search1::Find(args1, false, deterministic, workspace_size, ctx); + data_algo = search1::Find( + args1, false, deterministic, workspace_size, + ctx.template device_context()); #else using search1 = SearchAlgorithm; - data_algo = search1::Find(args1, false, deterministic, ctx); + data_algo = search1::Find( + args1, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); #endif @@ -521,11 +530,14 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - filter_algo = - search2::Find(args2, false, deterministic, workspace_size, ctx); + filter_algo = search2::Find( + args2, false, deterministic, workspace_size, + ctx.template device_context()); #else using search2 = SearchAlgorithm; - 
filter_algo = search2::Find(args2, false, deterministic, ctx); + filter_algo = search2::Find( + args2, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); #endif @@ -735,7 +747,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_X(X->type()); Tensor transformed_ddX(X->type()); @@ -794,26 +806,28 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (dO) { - math::PadFunction( - ctx, input_pad, transformed_dO_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_dO_channel, pad_value, &transformed_dO); } if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_X_channel, pad_value, &transformed_X); + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_X_channel, pad_value, + &transformed_X); if (ddX) { - math::PadFunction( - ctx, input_pad, transformed_ddX_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_ddX_channel, pad_value, &transformed_ddX); } } break; @@ -940,11 +954,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; workspace_size = search1::GetWorkspaceSize(args1); - bwd_algo1 = - search1::Find(args1, 
false, deterministic, workspace_size, ctx); + bwd_algo1 = search1::Find( + args1, false, deterministic, workspace_size, + ctx.template device_context()); #else using search1 = SearchAlgorithm; - bwd_algo1 = search1::Find(args1, false, deterministic, ctx); + bwd_algo1 = search1::Find( + args1, false, deterministic, + ctx.template device_context()); workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); #endif } @@ -961,11 +978,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); - bwd_algo2 = - search2::Find(args2, false, deterministic, workspace_size, ctx); + bwd_algo2 = search2::Find( + args2, false, deterministic, workspace_size, + ctx.template device_context()); #else using search2 = SearchAlgorithm; - bwd_algo2 = search2::Find(args2, false, deterministic, ctx); + bwd_algo2 = search2::Find( + args2, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); #endif @@ -986,11 +1006,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search3 = SearchAlgorithm; workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); - filter_algo = - search3::Find(args3, false, deterministic, workspace_size, ctx); + filter_algo = search3::Find( + args3, false, deterministic, workspace_size, + ctx.template device_context()); #else using search3 = SearchAlgorithm; - filter_algo = search3::Find(args3, false, deterministic, ctx); + filter_algo = search3::Find( + args3, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); #endif @@ -1009,11 +1032,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { using search4 = SearchAlgorithm; workspace_size = std::max(workspace_size, 
search4::GetWorkspaceSize(args4)); - data_algo = - search4::Find(args4, false, deterministic, workspace_size, ctx); + data_algo = search4::Find( + args4, false, deterministic, workspace_size, + ctx.template device_context()); #else using search4 = SearchAlgorithm; - data_algo = search4::Find(args4, false, deterministic, ctx); + data_algo = search4::Find( + args4, false, deterministic, + ctx.template device_context()); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); #endif diff --git a/paddle/fluid/operators/conv_transpose_op.cu b/paddle/fluid/operators/conv_transpose_op.cu index b2a4910222f1178d23e94eade9580248bb103c88..054cb4b33895b02a816cc2bff82b1c9052bc645d 100644 --- a/paddle/fluid/operators/conv_transpose_op.cu +++ b/paddle/fluid/operators/conv_transpose_op.cu @@ -13,10 +13,150 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/conv_transpose_op.h" +#include "paddle/phi/kernels/gpu/depthwise_conv.h" namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +template +class DepthwiseConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const std::string data_layout_str = + context.Attr("data_format"); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + PADDLE_ENFORCE_EQ( + groups, filter.dims()[0], + platform::errors::InvalidArgument( + "groups should be error to the 1st dimension of filter. 
But " + "received groups is %d and filter dimension[0] is %d", + groups, filter.dims()[0])); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + for (auto v : dilations) { + PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( + "dilations should be 1 in depthwise conv. " + "But received dilations is %d", + v)); + } + + auto in_dims = input->dims(); + auto filter_dims = filter.dims(); + + framework::DDim in_data_dims; + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, output, static_cast(0)); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad( + static_cast::TYPE&>(dev_ctx), + *output, filter, *input, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, output, data_layout); + } +}; + +template +class DepthwiseConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const std::string data_layout_str = + context.Attr("data_format"); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + 
context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + auto& dev_ctx = context.template device_context(); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + std::string padding_algorithm = + context.Attr("padding_algorithm"); + + auto in_dims = input->dims(); + auto filter_dims = filter.dims(); + + framework::DDim in_data_dims; + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); + } + framework::DDim filter_data_dims = + phi::slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = phi::vectorize(filter_data_dims); + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + in_data_dims, strides, ksize); + + if (input_grad) { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv( + static_cast::TYPE&>(dev_ctx), + *output_grad, filter, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, input_grad, data_layout); + } + + if (filter_grad) { + phi::funcs::SetConstant set_zero; + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad( + static_cast::TYPE&>(dev_ctx), + *output_grad, *input, strides, + std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, + dilations, filter_grad, data_layout); + } + } +}; + +} // namespace operators +} // namespace paddle // conv2d REGISTER_OP_CUDA_KERNEL(conv2d_transpose, ops::GemmConvTransposeKernel, diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 
76d6ad6bf2ff7361a90fb6f013f989db5a2b8845..ee0fb7ab3683364f6db3cffd7ddef67c61f19433 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -578,130 +577,5 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { } }; -template -class DepthwiseConvTransposeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - Tensor filter = *context.Input("Filter"); - Tensor* output = context.Output("Output"); - output->mutable_data(context.GetPlace()); - - int groups = context.Attr("groups"); - PADDLE_ENFORCE_EQ( - groups, filter.dims()[0], - platform::errors::InvalidArgument( - "groups should be error to the 1st dimension of filter. But " - "received groups is %d and filter dimension[0] is %d", - groups, filter.dims()[0])); - - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - for (auto v : dilations) { - PADDLE_ENFORCE_EQ(v, 1, platform::errors::InvalidArgument( - "dilations should be 1 in depthwise conv. 
" - "But received dilations is %d", - v)); - } - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - output->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, output, static_cast(0)); - - math::DepthwiseConvInputGradFunctor - depthwiseConvInputGrad; - depthwiseConvInputGrad( - dev_ctx, *output, filter, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, output, data_layout); - } -}; - -template -class DepthwiseConvTransposeGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const std::string data_layout_str = - context.Attr("data_format"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const Tensor* input = context.Input("Input"); - const Tensor* output_grad = - context.Input(framework::GradVarName("Output")); - Tensor* input_grad = - context.Output(framework::GradVarName("Input")); - Tensor* filter_grad = - context.Output(framework::GradVarName("Filter")); - Tensor filter = *context.Input("Filter"); - - if (!input_grad && !filter_grad) return; - - auto& dev_ctx = context.template device_context(); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::vector dilations = context.Attr>("dilations"); - std::string padding_algorithm = - 
context.Attr("padding_algorithm"); - - auto in_dims = input->dims(); - auto filter_dims = filter.dims(); - - framework::DDim in_data_dims; - if (data_layout != framework::DataLayout::kNHWC) { - in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } else { - in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } - framework::DDim filter_data_dims = - phi::slice_ddim(filter_dims, 2, filter_dims.size()); - std::vector ksize = phi::vectorize(filter_data_dims); - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - in_data_dims, strides, ksize); - - if (input_grad) { - math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv( - dev_ctx, *output_grad, filter, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, input_grad, data_layout); - } - - if (filter_grad) { - phi::funcs::SetConstant set_zero; - filter_grad->mutable_data(context.GetPlace()); - set_zero(dev_ctx, filter_grad, static_cast(0)); - - math::DepthwiseConvFilterGradFunctor - depthwiseConvFilterGrad; - depthwiseConvFilterGrad( - dev_ctx, *output_grad, *input, strides, - std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - dilations, filter_grad, data_layout); - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/cross_op.cc b/paddle/fluid/operators/cross_op.cc index fe00ee06603f0ecf2e3fa6ac367303a70702508f..674b75625d1983ba97f3d47ee154beff79c42dad 100644 --- a/paddle/fluid/operators/cross_op.cc +++ b/paddle/fluid/operators/cross_op.cc @@ -109,8 +109,8 @@ class CrossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, - PT_INFER_META(phi::CrossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, + PD_INFER_META(phi::CrossInferMeta)); REGISTER_OPERATOR(cross, ops::CrossOp, ops::CrossOpMaker, ops::CrossGradMaker, ops::CrossGradMaker, diff --git 
a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h deleted file mode 100644 index ab3860ecafc3569c13b0b9e5c882df9ddc03e190..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cum_op.h +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -template -class CumKernel : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - - void Compute(const framework::ExecutionContext& context) const override { - auto& X = GET_DATA_SAFELY(context.Input("X"), "Input", - "X", "Cum"); - - auto& Out = GET_DATA_SAFELY(context.Output("Out"), - "Output", "Out", "Cum"); - int axis = context.Attr("axis"); - bool exclusive = context.Attr("exclusive"); - bool reverse = context.Attr("reverse"); - auto out_dims = Out.dims(); - - PADDLE_ENFORCE_EQ( - axis < out_dims.size() && axis >= (0 - out_dims.size()), true, - platform::errors::OutOfRange( - "Attr(axis) is out of range, It's expected " - "to be in range of [-%d, %d]. 
But received Attr(axis) = %d.", - out_dims.size(), out_dims.size() - 1, axis)); - if (axis < 0) { - axis += out_dims.size(); - } - - Out.template mutable_data(context.GetPlace()); - - int pre = 1; - int post = 1; - int mid = out_dims[axis]; - for (int i = 0; i < axis; ++i) { - pre *= out_dims[i]; - } - for (int i = axis + 1; i < out_dims.size(); ++i) { - post *= out_dims[i]; - } - - auto x = framework::EigenVector::Flatten(X); - auto out = framework::EigenVector::Flatten(Out); - auto* place = - context.template device_context().eigen_device(); - - using IndexT = Eigen::DenseIndex; - if (pre == 1) { - if (post == 1) { - ComputeImp(*place, Eigen::DSizes(mid), x, out, - /* axis= */ 0, reverse, exclusive); - } else { - ComputeImp(*place, Eigen::DSizes(mid, post), x, out, - /* axis= */ 0, reverse, exclusive); - } - } else { - if (post == 1) { - ComputeImp(*place, Eigen::DSizes(pre, mid), x, out, - /* axis= */ 1, reverse, exclusive); - } else { - ComputeImp(*place, Eigen::DSizes(pre, mid, post), x, out, - /* axis= */ 1, reverse, exclusive); - } - } - } - - private: - template - void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis, - bool reverse, bool exclusive) const { - if (!reverse) { - out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive); - } else { - std::array rev; - rev.fill(false); - rev[axis] = reverse; - out.reshape(dims).device(d) = - Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev); - } - } -}; - -template -struct CumsumFunctor { - using ELEMENT_TYPE = T; - template - const typename X::TensorScanSumOp operator()(X x, int axis, - bool exclusive) const { - return x.cumsum(axis, exclusive); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cumprod_op.cc b/paddle/fluid/operators/cumprod_op.cc index bff6673429d9a4088c65f9dc02c1546f23d96878..90910bbbb2050bad85d10e0467a099c42030c084 100644 --- a/paddle/fluid/operators/cumprod_op.cc +++ 
b/paddle/fluid/operators/cumprod_op.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/cumprod_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace operators { @@ -87,16 +88,3 @@ REGISTER_OPERATOR(cumprod, ops::CumprodOp, ops::CumprodOpMaker, ops::CumprodGradOpMaker); REGISTER_OPERATOR(cumprod_grad, ops::CumprodGradOp); - -REGISTER_OP_CPU_KERNEL( - cumprod, ops::CumprodOpCPUKernel, ops::CumprodOpCPUKernel, - ops::CumprodOpCPUKernel, ops::CumprodOpCPUKernel, - ops::CumprodOpCPUKernel>, - ops::CumprodOpCPUKernel>); - -REGISTER_OP_CPU_KERNEL( - cumprod_grad, ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel, ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel, - ops::CumprodGradOpCPUKernel>, - ops::CumprodGradOpCPUKernel>); diff --git a/paddle/fluid/operators/cumprod_op.cu b/paddle/fluid/operators/cumprod_op.cu deleted file mode 100644 index f792d6832917f52573dce7ee3e449c2f4be63584..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cumprod_op.cu +++ /dev/null @@ -1,369 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "paddle/fluid/operators/cumprod_op.h" -#include "paddle/fluid/operators/math/inclusive_scan.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -template -struct MultiplyFunctor { - HOSTDEVICE T operator()(T a, T b) const { return a * b; } -}; - -template -class CumprodOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - auto *y = ctx.Output("Out"); - auto dim = ctx.Attr("dim"); - size_t outer_dim, mid_dim, inner_dim; - GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); - - const auto *x_data = x->data(); - auto *y_data = y->mutable_data(ctx.GetPlace()); - const auto &dev_ctx = - ctx.template device_context(); - math::InclusiveScan>( - x_data, y_data, outer_dim, mid_dim, inner_dim, static_cast(1), - MultiplyFunctor(), /*reverse=*/false, dev_ctx); - } -}; - -template -struct IsZeroFunctor { - HOSTDEVICE bool operator()(T x) const { return x == static_cast(0); } -}; - -template -struct CumprodGradFunctorExceptFirstZero { - HOSTDEVICE CumprodGradFunctorExceptFirstZero( - const T *x, const T *y, const T *dy_mul_y_reversed_cumsum, - const uint8_t *zero_mask, size_t mid_dim, size_t inner_dim, T *dx, - int64_t *first_zero_idx, T *x_filled_one) - : x_(x), - y_(y), - dy_mul_y_reversed_cumsum_(dy_mul_y_reversed_cumsum), - zero_mask_(zero_mask), - mid_dim_(mid_dim), - inner_dim_(inner_dim), - dx_(dx), - first_zero_idx_(first_zero_idx), - x_filled_one_(x_filled_one) {} - - HOSTDEVICE void operator()(size_t idx) const { - auto inner_idx = idx % inner_dim_; - auto outer_idx = idx / (mid_dim_ * inner_dim_); - auto mid_idx = (idx - inner_idx) / inner_dim_ % mid_dim_; - auto mask = zero_mask_[idx]; - bool should_fill_one = true; - - if (mask == 0) { - dx_[idx] = dy_mul_y_reversed_cumsum_[idx] / x_[idx]; - if (mid_idx == mid_dim_ - 1) { - 
// record first zero position as -1, i.e., no zero - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = -1; - } - } else if (mid_idx > 0) { // mask > 0 - if (zero_mask_[idx - inner_dim_] > 0) { // not first zero - dx_[idx] = 0; - should_fill_one = false; - } else { - // idx is the first zero position, it should be recorded - dx_[idx] = y_[idx - inner_dim_]; - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = mid_idx; - } - } else { // the first zero position is index 0 - dx_[idx] = 1; - first_zero_idx_[outer_idx * inner_dim_ + inner_idx] = 0; - } - - x_filled_one_[idx] = should_fill_one ? 1 : x_[idx]; - } - - private: - const T *x_; - const T *y_; - const T *dy_mul_y_reversed_cumsum_; - const uint8_t *zero_mask_; - size_t mid_dim_; - size_t inner_dim_; - T *dx_; - int64_t *first_zero_idx_; - T *x_filled_one_; -}; - -template -struct FillFirstZeroPositionGradFunctor { - HOSTDEVICE FillFirstZeroPositionGradFunctor(const int64_t *first_zero_idx, - const T *grad_value, - size_t mid_dim, size_t inner_dim, - T *dx) - : first_zero_idx_(first_zero_idx), - grad_value_(grad_value), - mid_dim_(mid_dim), - inner_dim_(inner_dim), - dx_(dx) {} - - HOSTDEVICE void operator()(size_t idx) const { - auto outer_idx = idx / inner_dim_; - auto inner_idx = idx % inner_dim_; - auto mid_idx = first_zero_idx_[idx]; - if (mid_idx >= 0) { - auto full_idx = - outer_idx * mid_dim_ * inner_dim_ + mid_idx * inner_dim_ + inner_idx; - dx_[full_idx] *= grad_value_[full_idx]; - } - } - - private: - const int64_t *first_zero_idx_; - const T *grad_value_; - size_t mid_dim_; - size_t inner_dim_; - T *dx_; -}; - -/* -Reference to -https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ReduceOps.cpp -input: x, y, dL/dy -output: dL/dx -dL/dx[i] = sum{0<=j k, dL/dx[i] = 0; -i < k, dL/dx[i] = 1/x[i]*sum{i<=j k - dx[i] = 0; - x_filled_one[i] = x[i]; - } - } - } -} -T = reversed_cumsum(dy[j]*cumprod(x_filled_one[j])); -if (zero_index != -1) { - dx[zero_index] *= T[zero_index]; -} -*/ - 
-template -class CumprodGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *y = ctx.Input("Out"); - const auto *dy = - ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - auto dim = ctx.Attr("dim"); - - size_t outer_dim, mid_dim, inner_dim; - GetCumprodDimInfo(x->dims(), dim, &outer_dim, &mid_dim, &inner_dim); - if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; - - size_t numel = outer_dim * mid_dim * inner_dim; - - const auto *x_data = x->data(); - const auto *y_data = y->data(); - const auto *dy_data = dy->data(); - - auto place = ctx.GetPlace(); - const auto &dev_ctx = - ctx.template device_context(); - auto *dx_data = dx->mutable_data(place); - - // deal with complex - const T *x_data_deal; - const T *y_data_deal; - memory::AllocationPtr x_conj; - memory::AllocationPtr y_conj; - if (framework::IsComplex::value) { - x_conj = memory::Alloc(place, numel * sizeof(T)); - auto *x_data_conj = reinterpret_cast(x_conj->ptr()); - y_conj = memory::Alloc(place, numel * sizeof(T)); - auto *y_data_conj = reinterpret_cast(y_conj->ptr()); - - platform::ForRange for_range_x(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); - for_range_x(functor_x); - - platform::ForRange for_range_y(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_y(y_data, numel, y_data_conj); - for_range_y(functor_y); - x_data_deal = x_data_conj; - y_data_deal = y_data_conj; - } else { - x_data_deal = x_data; - y_data_deal = y_data; - } - -// Step 1: find cummax-ed zero mask of x -#ifdef PADDLE_WITH_CUDA - const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); -#else - const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream()); -#endif - auto zero_mask_without_cummax = - memory::Alloc(place, numel * sizeof(uint8_t)); - auto *zero_mask_without_cummax_data = - 
reinterpret_cast(zero_mask_without_cummax->ptr()); - thrust::transform( - exec_policy, thrust::device_pointer_cast(x_data_deal), - thrust::device_pointer_cast(x_data_deal) + numel, - thrust::device_pointer_cast(zero_mask_without_cummax_data), - IsZeroFunctor()); - - auto zero_mask = memory::Alloc(place, numel * sizeof(uint8_t)); - auto *zero_mask_data = reinterpret_cast(zero_mask->ptr()); - math::InclusiveScan( - zero_mask_without_cummax_data, zero_mask_data, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Max(), /*reverse=*/false, - dev_ctx); - zero_mask_without_cummax = nullptr; - - // Step 2: calculate reversed cumsum(dy * y) - auto dy_mul_y = memory::Alloc(place, numel * sizeof(T)); - auto *dy_mul_y_data = reinterpret_cast(dy_mul_y->ptr()); - thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data), - thrust::device_pointer_cast(dy_data) + numel, - thrust::device_pointer_cast(y_data_deal), - thrust::device_pointer_cast(dy_mul_y_data), - MultiplyFunctor()); - - auto dy_mul_y_reversed_cumsum = memory::Alloc(place, numel * sizeof(T)); - auto *dy_mul_y_reversed_cumsum_data = - reinterpret_cast(dy_mul_y_reversed_cumsum->ptr()); - math::InclusiveScan( - dy_mul_y_data, dy_mul_y_reversed_cumsum_data, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Sum(), /*reverse=*/true, dev_ctx); - - // Step 3: calculate the gradient value except the first zero position. - // The gradient value of the first zero position is filled with out[idx-1], - // while the gradient value of the other positions are calculated out - // completely. This functor also: - // (1) find the first zero index, i.e., first_zero_idx_data. 
- // (2) fill x_filled_one, which satifies - // x_filled_one[i] = x[i], i > pos - // x_filled_one[i] = 1, i <= pos - auto first_zero_idx = - memory::Alloc(place, outer_dim * inner_dim * sizeof(int64_t)); - auto *first_zero_idx_data = - reinterpret_cast(first_zero_idx->ptr()); - auto *x_filled_one_data = dy_mul_y_data; // reuse former allocated memory - platform::ForRange for_range(dev_ctx, numel); - CumprodGradFunctorExceptFirstZero functor_except_first_zero( - x_data_deal, y_data_deal, dy_mul_y_reversed_cumsum_data, zero_mask_data, - mid_dim, inner_dim, dx_data, first_zero_idx_data, x_filled_one_data); - for_range(functor_except_first_zero); - - // Step 4: calculate cumprod of x_filled_one - auto *x_filled_one_cumprod_data = - dy_mul_y_reversed_cumsum_data; // reuse former allocated memory - math::InclusiveScan>( - x_filled_one_data, x_filled_one_cumprod_data, outer_dim, mid_dim, - inner_dim, static_cast(1), MultiplyFunctor(), /*reverse=*/false, - dev_ctx); - - // Step 5: calculate reversed cumsum(dy * x_filled_one_cumprod) - auto *dy_mul_x_filled_one_cumprod = - dy_mul_y_data; // reuse former allocated memory - thrust::transform(exec_policy, thrust::device_pointer_cast(dy_data), - thrust::device_pointer_cast(dy_data) + numel, - thrust::device_pointer_cast(x_filled_one_cumprod_data), - thrust::device_pointer_cast(dy_mul_x_filled_one_cumprod), - MultiplyFunctor()); - auto *dy_mul_x_filled_one_cumprod_reversed_cumsum = - dy_mul_y_reversed_cumsum_data; // reuse former allocated memory - math::InclusiveScan( - dy_mul_x_filled_one_cumprod, - dy_mul_x_filled_one_cumprod_reversed_cumsum, outer_dim, mid_dim, - inner_dim, static_cast(0), cub::Sum(), - /*reverse=*/true, dev_ctx); - - // Step 6: fill zero pos gradient value - platform::ForRange - for_range_fill_zero_pos_grad(dev_ctx, outer_dim * inner_dim); - FillFirstZeroPositionGradFunctor fill_first_zero_pos_grad_functor( - first_zero_idx_data, dy_mul_x_filled_one_cumprod_reversed_cumsum, - mid_dim, inner_dim, dx_data); - 
for_range_fill_zero_pos_grad(fill_first_zero_pos_grad_functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - cumprod, ops::CumprodOpCUDAKernel, ops::CumprodOpCUDAKernel, - ops::CumprodOpCUDAKernel, ops::CumprodOpCUDAKernel, - ops::CumprodOpCUDAKernel>, - ops::CumprodOpCUDAKernel>); - -REGISTER_OP_CUDA_KERNEL( - cumprod_grad, ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel, ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel, - ops::CumprodGradOpCUDAKernel>, - ops::CumprodGradOpCUDAKernel>); diff --git a/paddle/fluid/operators/cumprod_op.h b/paddle/fluid/operators/cumprod_op.h deleted file mode 100644 index 74ed2008ae98380388d874529264c6b6c0b5a49a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cumprod_op.h +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -static void GetCumprodDimInfo(const framework::DDim& dim, int cumprod_dim, - size_t* outer_dim, size_t* mid_dim, - size_t* inner_dim) { - PADDLE_ENFORCE_GE( - cumprod_dim, -dim.size(), - platform::errors::InvalidArgument( - "The input dim of CumprodOp should be larger than the opposite " - "rank of input x which is %d.But received dim=%d", - -dim.size(), cumprod_dim)); - PADDLE_ENFORCE_LT(cumprod_dim, dim.size(), - platform::errors::InvalidArgument( - "The input dim of CumprodOp should be smaller than the " - "rank of input x which is %d.But received dim=%d", - dim.size(), cumprod_dim)); - if (cumprod_dim < 0) cumprod_dim += dim.size(); - - *outer_dim = 1; - for (int i = 0; i < cumprod_dim; ++i) { - *outer_dim *= dim[i]; - } - *mid_dim = dim[cumprod_dim]; - *inner_dim = 1; - for (int i = cumprod_dim + 1; i < dim.size(); ++i) { - *inner_dim *= dim[i]; - } -} - -template -class CumprodOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - int dim = context.Attr("dim"); - - auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - framework::DDim shape = x->dims(); - - size_t outer_dim = 1; - size_t mid_dim = 1; - size_t inner_dim = 1; - GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); - - for (size_t i = 0; i < outer_dim; i++) { - for (size_t j = 0; j < mid_dim; j++) { - for (size_t k = 0; k < inner_dim; k++) { - size_t pos = i * mid_dim * inner_dim + j * inner_dim + k; - if (j == 0) { - out_data[pos] = x_data[pos]; - } else { - out_data[pos] = out_data[pos - 
inner_dim] * x_data[pos]; - } - } - } - } - } -}; - -template -class CumprodGradOpCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const { - const Tensor* d_out = context.Input(framework::GradVarName("Out")); - const Tensor* x = context.Input("X"); - const Tensor* out = context.Input("Out"); - - int dim = context.Attr("dim"); - framework::DDim shape = x->dims(); - Tensor* d_x = context.Output(framework::GradVarName("X")); - - auto* d_out_data = d_out->data(); - auto* x_data = x->data(); - auto* out_data = out->data(); - auto* d_x_data = d_x->mutable_data(context.GetPlace()); - - auto place = context.GetPlace(); - const auto& dev_ctx = - context.template device_context(); - - size_t outer_dim = 1; - size_t mid_dim = 1; - size_t inner_dim = 1; - GetCumprodDimInfo(shape, dim, &outer_dim, &mid_dim, &inner_dim); - size_t numel = outer_dim * mid_dim * inner_dim; - - // deal with complex - const T* x_data_deal; - const T* out_data_deal; - memory::AllocationPtr x_conj; - memory::AllocationPtr out_conj; - if (framework::IsComplex::value) { - x_conj = memory::Alloc(place, numel * sizeof(T)); - auto* x_data_conj = reinterpret_cast(x_conj->ptr()); - out_conj = memory::Alloc(place, numel * sizeof(T)); - auto* out_data_conj = reinterpret_cast(out_conj->ptr()); - - platform::ForRange for_range_x(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_x(x_data, numel, x_data_conj); - for_range_x(functor_x); - - platform::ForRange for_range_out(dev_ctx, - numel); - phi::funcs::ConjFunctor functor_out(out_data, numel, out_data_conj); - for_range_out(functor_out); - - x_data_deal = x_data_conj; - out_data_deal = out_data_conj; - } else { - x_data_deal = x_data; - out_data_deal = out_data; - } - - for (size_t i = 0; i < outer_dim; i++) { - for (size_t k = 0; k < inner_dim; k++) { - for (size_t j = 0; j < mid_dim; j++) { - size_t index = i * mid_dim * inner_dim + j * inner_dim + k; - d_x_data[index] = 0; - for (size_t n = 0; n < 
mid_dim; n++) { - size_t pos = i * mid_dim * inner_dim + n * inner_dim + k; - T elem; - if (j == 0) { - elem = d_out_data[pos]; - } else { - elem = d_out_data[pos] * out_data_deal[index - inner_dim]; - } - if (pos > index) { - for (size_t m = index + inner_dim; m <= pos; m += inner_dim) { - elem *= x_data_deal[m]; - } - } else if (pos < index) { - elem = static_cast(0); - } - d_x_data[index] += elem; - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 9fa355a924612651556f2a79711cae4ce17379f8..11633fb0b870327f14e4454b3f94a43940a9df53 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/cum_op.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,17 +24,6 @@ namespace operators { class CumOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->Attrs().Get("flatten")) { - ctx->SetOutputDim("Out", - phi::make_ddim({phi::product(ctx->GetInputDim("X"))})); - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,15 +79,12 @@ class CumsumGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; - 
+DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, + PD_INFER_META(phi::CumsumInferMeta)); REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker, - ops::CumsumGradMaker); -REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>, - ops::CumKernel>); + ops::CumsumGradMaker, + CumsumInferShapeFunctor); REGISTER_OP_VERSION(cumsum) .AddCheckpoint( diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu deleted file mode 100644 index 3402f42521f54f315390fe2162309fb204fd9b00..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cumsum_op.cu +++ /dev/null @@ -1,325 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/operators/cum_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -using Tensor = paddle::framework::Tensor; -using LoDTensor = paddle::framework::LoDTensor; - -namespace paddle { -namespace operators { - -template -__device__ void BlockReverse(const T* idata, T* odata, int src_base, - int dst_base, int valid_item) { - __shared__ T sh_mem[BLOCK_SIZE]; - int tx = threadIdx.x; - - int offset = tx; - int in_index = src_base + offset; - if (offset >= valid_item) { - sh_mem[offset] = 0; - } else { - int sh_mem_index = BLOCK_SIZE - offset - 1; - T data = idata[in_index]; - sh_mem[sh_mem_index] = data; - } - - __syncthreads(); - int out_index = dst_base - offset; - if (offset < valid_item) { - int sh_mem_index = BLOCK_SIZE - offset - 1; - odata[out_index] = sh_mem[sh_mem_index]; - } -} - -template -__global__ void MatrixRowReverse(const T* matrix_data, T* reverse_data, - int reverse_size, int outer_size, - int inner_size) { - int bx = blockIdx.x; - int by = blockIdx.y; - int item_per_block = 1024; - - for (int block_offset = 0; block_offset < reverse_size; - block_offset += item_per_block) { - int valid_item = (reverse_size - block_offset > item_per_block) - ? 
item_per_block - : reverse_size - block_offset; - int src_offset = - bx * reverse_size + block_offset + by * (inner_size * reverse_size); - int dst_offset = bx * reverse_size + by * (inner_size * reverse_size) + - reverse_size - 1 - block_offset; - if (reverse_size < item_per_block) { - valid_item = reverse_size; - } - - BlockReverse(matrix_data, reverse_data, src_offset, dst_offset, - valid_item); - } -} - -template -struct BlockPrefixCallbackOp { - // Running prefix - T running_total; - // Constructor - __device__ BlockPrefixCallbackOp(T running_total) - : running_total(running_total) {} - // Callback operator to be entered by the first warp of threads in the block. - // Thread-0 is responsible for returning a value for seeding the block-wide - // scan. - __device__ T operator()(T block_aggregate) { - T old_prefix = running_total; - running_total = old_prefix + block_aggregate; - return old_prefix; - } -}; - -// No bank-conflict transpose -template -__global__ void MatrixTranspose(T* odata, const T* idata, size_t height, - size_t width) { - __shared__ T tile[TILE_DIM][TILE_DIM + 1]; - - int x = blockIdx.x * TILE_DIM + threadIdx.x; - int y = blockIdx.y * TILE_DIM + threadIdx.y; - for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { - if (x < width && (y + j) < height) { - tile[threadIdx.y + j][threadIdx.x] = idata[(y + j) * width + x]; - } else { - tile[threadIdx.y + j][threadIdx.x] = 0; - } - } - - __syncthreads(); - - x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset - y = blockIdx.x * TILE_DIM + threadIdx.y; - - for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) { - if (x < height && (y + j) < width) { - odata[(y + j) * height + x] = tile[threadIdx.x][threadIdx.y + j]; - } - } -} - -template -__global__ void BlockScanKernel(T* d_out, const T* d_in, int inner_size, - int outer_size, int scan_size, bool exclusive) { - // Specialize BlockLoad, BlockStore, and BlockRadixSort collective types - typedef cub::BlockLoad - BlockLoadT; - typedef cub::BlockStore 
- BlockStoreT; - typedef cub::BlockScan BlockScanT; - // Allocate type-safe, repurposable shared memory for collectives - __shared__ union { - typename BlockLoadT::TempStorage load; - typename BlockStoreT::TempStorage store; - typename BlockScanT::TempStorage scan; - } temp_storage; - - int bx = blockIdx.x; - int by = blockIdx.y; - - BlockPrefixCallbackOp prefix_op(0); - T block_aggregate = static_cast(0); - - // Obtain this block's segment of consecutive keys (blocked across threads) - int item_per_block = BLOCK_THREADS * ITEMS_PER_THREAD; - for (int block_offset = 0; block_offset < scan_size; - block_offset += BLOCK_THREADS * ITEMS_PER_THREAD) { - int valid_item = (scan_size - block_offset > item_per_block) - ? item_per_block - : (scan_size - block_offset); - if (scan_size < item_per_block) { - valid_item = scan_size; - } - - int offset = bx * scan_size + block_offset + by * (inner_size * scan_size); - - T thread_keys[ITEMS_PER_THREAD]; - BlockLoadT(temp_storage.load) - .Load(d_in + offset, thread_keys, valid_item, 0); - - __syncthreads(); - if (exclusive) { - T init_value = static_cast(0); - BlockScanT(temp_storage.scan) - .ExclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); - } else { - BlockScanT(temp_storage.scan) - .InclusiveScan(thread_keys, thread_keys, cub::Sum(), prefix_op); - } - __syncthreads(); - - BlockStoreT(temp_storage.store) - .Store(d_out + offset, thread_keys, valid_item); - } -} - -template -class CumCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - - int axis = context.Attr("axis"); - bool exclusive = context.Attr("exclusive"); - bool reverse = context.Attr("reverse"); - auto out_dims = out->dims(); - auto size = in->numel(); - - PADDLE_ENFORCE_EQ( - axis < out_dims.size() && axis >= (0 - out_dims.size()), true, - platform::errors::OutOfRange( - "Attr(axis) is out of range, It's 
expected " - "to be in range of [-%d, %d]. But received Attr(axis) = %d.", - out_dims.size(), out_dims.size() - 1, axis)); - if (axis < 0) { - axis += out_dims.size(); - } - - T* out_data = out->mutable_data(context.GetPlace()); - const T* in_data = in->data(); - - // Use thrust for parallel acceleration when the input size is equal to the - // length of the ‘axis’ dimension. - if (size == out_dims[axis]) { - if (reverse) { - thrust::device_ptr dev_ptr = - thrust::device_pointer_cast(in_data); - thrust::device_vector vec(dev_ptr, dev_ptr + size); - if (exclusive) { - thrust::exclusive_scan(thrust::device, vec.rbegin(), vec.rend(), - out_data); - } else { - thrust::inclusive_scan(thrust::device, vec.rbegin(), vec.rend(), - out_data); - } - thrust::reverse(thrust::device, out_data, out_data + size); - } else { - if (exclusive) { - thrust::exclusive_scan(thrust::device, in_data, in_data + size, - out_data); - } else { - thrust::inclusive_scan(thrust::device, in_data, in_data + size, - out_data); - } - } - return; - } - - size_t height = 1; - size_t width = 1; - for (size_t i = 0; i <= axis; i++) { - height *= out_dims[i]; - } - - for (size_t i = axis + 1; i < out_dims.size(); i++) { - width *= out_dims[i]; - } - int scan_size = out_dims[axis]; - bool transpose = (axis != out_dims.size() - 1); - - int tile_size = 32; - dim3 blocks(32, 8); - dim3 transpose_grids((width + tile_size - 1) / tile_size, - (height + tile_size - 1) / tile_size); - auto& dev_ctx = context.template device_context(); - framework::Tensor tmp; - tmp.Resize(out_dims); - auto* tmp_data = tmp.mutable_data(context.GetPlace()); - T* next_in_data = out_data; - T* next_out_data = tmp_data; - if (transpose) { - MatrixTranspose<<>>( - out_data, in_data, height, width); - next_in_data = out_data; - next_out_data = tmp_data; - } - auto swap_ptr = [](T*& ptr1, T*& ptr2) { - T* tmp = ptr2; - ptr2 = ptr1; - ptr1 = tmp; - }; - int outer_size = height / scan_size; - int inner_size = width; - // Consider the size 
of shared memory, here block size is 128 - dim3 scan_grid(outer_size, inner_size); - dim3 reverse_grid = scan_grid; - if (reverse) { - if (transpose) { - reverse_grid.x = scan_grid.y; - reverse_grid.y = scan_grid.x; - MatrixRowReverse<<>>( - next_in_data, next_out_data, scan_size, outer_size, inner_size); - if (!transpose) next_in_data = tmp_data; - swap_ptr(next_in_data, next_out_data); - } else { - MatrixRowReverse<<>>( - in_data, out_data, scan_size, outer_size, inner_size); - } - } - if (!transpose && !reverse) { - BlockScanKernel<<>>( - out_data, in_data, outer_size, inner_size, scan_size, exclusive); - - } else { - BlockScanKernel<<>>( - next_out_data, next_in_data, outer_size, inner_size, scan_size, - exclusive); - } - swap_ptr(next_in_data, next_out_data); - if (reverse) { - MatrixRowReverse<<>>( - next_in_data, next_out_data, scan_size, outer_size, inner_size); - swap_ptr(next_in_data, next_out_data); - } - if (transpose) { - transpose_grids.x = (height + tile_size - 1) / tile_size; - transpose_grids.y = (width + tile_size - 1) / tile_size; - MatrixTranspose<<>>( - next_out_data, next_in_data, width, height); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cumsum, ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel, - ops::CumCUDAKernel); diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc index 38bf53ca0aa1a2dddca4ac2d2043de10fcdb7830..d197e4362e96976661ab891929b4503977f52ff0 100644 --- a/paddle/fluid/operators/cumsum_op_npu.cc +++ b/paddle/fluid/operators/cumsum_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/cum_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 1ebafa54598574ae9027a4887639a2a1d27448ea..568c7982cfc7c07b9c7f840ccaa32e4025225122 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -62,7 +62,7 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc) detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) -detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu) +detection_library(yolo_box_op SRCS yolo_box_op.cc) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu) detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 7927410ef37862499aadf61d6e04c45af157f347..83cf6e5fd30f6bcad4870d1ebd18a50e21518dfe 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -93,7 +93,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { // Here the box_dims[0] is not the real dimension of output. // It will be rewritten in the computing kernel. 
if (score_size == 3) { - ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2}); + ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); } else { ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); } @@ -545,11 +545,10 @@ class MultiClassNMS2Op : public MultiClassNMSOp { void InferShape(framework::InferShapeContext* ctx) const override { MultiClassNMSOp::InferShape(ctx); - auto box_dims = ctx->GetInputDim("BBoxes"); auto score_dims = ctx->GetInputDim("Scores"); auto score_size = score_dims.size(); if (score_size == 3) { - ctx->SetOutputDim("Index", {box_dims[1], 1}); + ctx->SetOutputDim("Index", {-1, 1}); } else { ctx->SetOutputDim("Index", {-1, 1}); } diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 511d8e0eed1065ae0cd2cec3a7bcf534cd3043ab..0d9fbf612f73c428fb8050fcfcc319ddafabe482 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -9,7 +9,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -240,8 +239,6 @@ REGISTER_OPERATOR( yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel, - ops::YoloBoxKernel); REGISTER_OP_VERSION(yolo_box) .AddCheckpoint( diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu deleted file mode 100644 index fb5c214a59e1274ffc30226bf49a068df960f414..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ /dev/null @@ -1,143 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/detection/yolo_box_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/kernels/funcs/math_function.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, - T* scores, const float conf_thresh, - const int* anchors, const int n, const int h, - const int w, const int an_num, const int class_num, - const int box_num, int input_size_h, - int input_size_w, bool clip_bbox, const float scale, - const float bias, bool iou_aware, - const float iou_aware_factor) { - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - T box[4]; - for (; tid < n * box_num; tid += stride) { - int grid_num = h * w; - int i = tid / box_num; - int j = (tid % box_num) / grid_num; - int k = (tid % grid_num) / w; - int l = tid % w; - - int an_stride = (5 + class_num) * grid_num; - int img_height = imgsize[2 * i]; - int img_width = imgsize[2 * i + 1]; - - int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4, - iou_aware); - T conf = sigmoid(input[obj_idx]); - if (iou_aware) { - int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); - T iou = sigmoid(input[iou_idx]); - conf = pow(conf, static_cast(1. 
- iou_aware_factor)) * - pow(iou, static_cast(iou_aware_factor)); - } - if (conf < conf_thresh) { - continue; - } - - int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0, - iou_aware); - GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, - input_size_w, box_idx, grid_num, img_height, img_width, scale, - bias); - box_idx = (i * box_num + j * grid_num + k * w + l) * 4; - CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); - - int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, - 5, iou_aware); - int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; - CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, - grid_num); - } -} - -template -class YoloBoxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* img_size = ctx.Input("ImgSize"); - auto* boxes = ctx.Output("Boxes"); - auto* scores = ctx.Output("Scores"); - - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float conf_thresh = ctx.Attr("conf_thresh"); - int downsample_ratio = ctx.Attr("downsample_ratio"); - bool clip_bbox = ctx.Attr("clip_bbox"); - bool iou_aware = ctx.Attr("iou_aware"); - float iou_aware_factor = ctx.Attr("iou_aware_factor"); - float scale = ctx.Attr("scale_x_y"); - float bias = -0.5 * (scale - 1.); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int box_num = boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int input_size_h = downsample_ratio * h; - int input_size_w = downsample_ratio * w; - - auto& dev_ctx = ctx.cuda_device_context(); - int bytes = sizeof(int) * anchors.size(); - auto anchors_ptr = memory::Alloc(dev_ctx, sizeof(int) * anchors.size()); - int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); - const auto gplace = ctx.GetPlace(); - const auto cplace = 
platform::CPUPlace(); - memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes, - dev_ctx.stream()); - - const T* input_data = input->data(); - const int* imgsize_data = img_size->data(); - T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); - T* scores_data = - scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(dev_ctx, boxes, static_cast(0)); - set_zero(dev_ctx, scores, static_cast(0)); - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num); - - dim3 thread_num = config.thread_per_block; -#ifdef WITH_NV_JETSON - if (config.compute_capability == 53 || config.compute_capability == 62) { - thread_num = 512; - } -#endif - - KeYoloBoxFw<<>>( - input_data, imgsize_data, boxes_data, scores_data, conf_thresh, - anchors_data, n, h, w, an_num, class_num, box_num, input_size_h, - input_size_w, clip_bbox, scale, bias, iou_aware, iou_aware_factor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel, - ops::YoloBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h deleted file mode 100644 index 2cd69c60b7c44d0557c23b8d1bd933650e8402c3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -HOSTDEVICE inline T sigmoid(T x) { - return 1.0 / (1.0 + std::exp(-x)); -} - -template -HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, - int j, int an_idx, int grid_size_h, - int grid_size_w, int input_size_h, - int input_size_w, int index, int stride, - int img_height, int img_width, float scale, - float bias) { - box[0] = (i + sigmoid(x[index]) * scale + bias) * img_width / grid_size_w; - box[1] = (j + sigmoid(x[index + stride]) * scale + bias) * img_height / - grid_size_h; - box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / - input_size_w; - box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * - img_height / input_size_h; -} - -HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, - int an_num, int an_stride, int stride, - int entry, bool iou_aware) { - if (iou_aware) { - return (batch * an_num + an_idx) * an_stride + - (batch * an_num + an_num + entry) * stride + hw_idx; - } else { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; - } -} - -HOSTDEVICE inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num, - int an_stride, int stride) { - return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + - hw_idx; -} - -template -HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx, - const int img_height, - const int img_width, bool clip_bbox) { - boxes[box_idx] = box[0] - box[2] / 2; - boxes[box_idx + 1] = box[1] - box[3] / 2; - boxes[box_idx + 2] = box[0] + box[2] / 2; - boxes[box_idx + 3] = box[1] + box[3] / 2; - - if 
(clip_bbox) { - boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); - boxes[box_idx + 1] = - boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); - boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 - ? boxes[box_idx + 2] - : static_cast(img_width - 1); - boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 - ? boxes[box_idx + 3] - : static_cast(img_height - 1); - } -} - -template -HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input, - const int label_idx, const int score_idx, - const int class_num, const T conf, - const int stride) { - for (int i = 0; i < class_num; i++) { - scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); - } -} - -template -class YoloBoxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* imgsize = ctx.Input("ImgSize"); - auto* boxes = ctx.Output("Boxes"); - auto* scores = ctx.Output("Scores"); - auto anchors = ctx.Attr>("anchors"); - int class_num = ctx.Attr("class_num"); - float conf_thresh = ctx.Attr("conf_thresh"); - int downsample_ratio = ctx.Attr("downsample_ratio"); - bool clip_bbox = ctx.Attr("clip_bbox"); - bool iou_aware = ctx.Attr("iou_aware"); - float iou_aware_factor = ctx.Attr("iou_aware_factor"); - float scale = ctx.Attr("scale_x_y"); - float bias = -0.5 * (scale - 1.); - - const int n = input->dims()[0]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - const int box_num = boxes->dims()[1]; - const int an_num = anchors.size() / 2; - int input_size_h = downsample_ratio * h; - int input_size_w = downsample_ratio * w; - - const int stride = h * w; - const int an_stride = (class_num + 5) * stride; - - Tensor anchors_; - auto anchors_data = - anchors_.mutable_data({an_num * 2}, ctx.GetPlace()); - std::copy(anchors.begin(), anchors.end(), anchors_data); - - const T* input_data = input->data(); - const int* imgsize_data = imgsize->data(); - 
T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); - memset(boxes_data, 0, boxes->numel() * sizeof(T)); - T* scores_data = - scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - memset(scores_data, 0, scores->numel() * sizeof(T)); - - T box[4]; - for (int i = 0; i < n; i++) { - int img_height = imgsize_data[2 * i]; - int img_width = imgsize_data[2 * i + 1]; - - for (int j = 0; j < an_num; j++) { - for (int k = 0; k < h; k++) { - for (int l = 0; l < w; l++) { - int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 4, iou_aware); - T conf = sigmoid(input_data[obj_idx]); - if (iou_aware) { - int iou_idx = - GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride); - T iou = sigmoid(input_data[iou_idx]); - conf = pow(conf, static_cast(1. - iou_aware_factor)) * - pow(iou, static_cast(iou_aware_factor)); - } - if (conf < conf_thresh) { - continue; - } - - int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 0, iou_aware); - GetYoloBox(box, input_data, anchors_data, l, k, j, h, w, - input_size_h, input_size_w, box_idx, stride, - img_height, img_width, scale, bias); - box_idx = (i * box_num + j * stride + k * w + l) * 4; - CalcDetectionBox(boxes_data, box, box_idx, img_height, img_width, - clip_bbox); - - int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, - stride, 5, iou_aware); - int score_idx = (i * box_num + j * stride + k * w + l) * class_num; - CalcLabelScore(scores_data, input_data, label_idx, score_idx, - class_num, conf, stride); - } - } - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index 375ef4344f4741c947ef3134696d64cdae696780..f89ecd37222870f73d00870c9454bf5590d504e3 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -19,11 +19,17 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include 
"paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" namespace paddle { namespace operators { @@ -172,7 +178,7 @@ template class DeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* det = context.Input("Out"); const auto* grad = @@ -200,15 +206,18 @@ class DeterminantGradKernel : public framework::OpKernel { // checked in forward, pass } + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the matrix is invertible // (matrix A not invertible) == (det(A)=0) if (!CheckMatrixInvertible(context, det)) { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; ddet->Resize(input->dims()); - ddet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, ddet, static_cast(0.0f)); + phi::Full(dev_ctx, phi::vectorize(input->dims()), static_cast(0.0f), + ddet); return; } @@ -218,35 +227,35 @@ class DeterminantGradKernel : public framework::OpKernel { // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, // -1) - math::DeviceIndependenceTensorOperations helper(context); - // 
First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, inverse_A); + VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " << transpose_inverse_A.dims(); // Third: dA * |A| - auto mul_dA_detA = helper.Mul(*grad, *det); + auto mul_dA_detA = phi::Multiply(dev_ctx, *grad, *det); VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); // Fourth: unsqueeze(dA * |A|, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(mul_dA_detA, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); @@ -331,7 +340,7 @@ template class SlogDeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* slogdet = context.Input("Out"); const auto* grad = @@ -353,6 +362,10 @@ class SlogDeterminantGradKernel : public framework::OpKernel { input->dims().size() - grad->dims().size())); } + auto& dev_ctx = static_cast< + const typename 
framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the matrix is invertible // (matrix A not invertible) == (absslogdet(A)=0) auto slogdet_vec = slogdet->Split(1, 0); @@ -361,9 +374,8 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); - dslogdet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, dslogdet, std::numeric_limits::quiet_NaN()); + phi::Full(dev_ctx, phi::vectorize(input->dims()), + std::numeric_limits::quiet_NaN(), dslogdet); return; } @@ -373,34 +385,25 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // we set dsl|A| = unsqueeze(dslA, [-1, -2]) * // inverse(A).conj().transpose(-2, -1) - math::DeviceIndependenceTensorOperations helper(context); - // First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).conj() - framework::Tensor conj_inverse_A; - conj_inverse_A.Resize(inverse_A.dims()); - auto numel = input->numel(); - auto* conj_data = conj_inverse_A.mutable_data(context.GetPlace(), - size_t(numel * sizeof(T))); - - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::ConjFunctor functor(inverse_A.data(), numel, conj_data); - for_range(functor); + auto conj_inverse_A = phi::Conj(dev_ctx, inverse_A); VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims(); // Third: inverse(A).conj().transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(conj_inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, conj_inverse_A); VLOG(3) << 
"inverse(A).conj().transpose(-2, -1) dims: " << transpose_inverse_A.dims(); @@ -417,12 +420,12 @@ class SlogDeterminantGradKernel : public framework::OpKernel { det_grad.Resize(det_grad.dims().reshape(det_grad_vec)); // Fifth: unsqueeze(dslA, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(det_grad, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(det_grad, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dslA, [-1, -2]) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dslA) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dslA) * inverse(A) dims: " << res.dims(); framework::TensorCopy(res, context.GetPlace(), dslogdet); diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 0160277dc79af50c555b1257e6ffa216b7b56b62..ac8c12bcd7ebaa6f47e8d3582887ac327a9f8957 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include - #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/infermeta/unary.h" @@ -58,15 +56,56 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } }; +class DiagV2GradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "X", "X", "DiagV2Grad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "DiagV2Grad"); + + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class DiagV2GradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("diag_v2_grad"); + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagGradV2NoNeedBufferVarsInferer, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, - PT_INFER_META(phi::DiagInferMeta)); - -REGISTER_OPERATOR( - diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - DiagInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, + PD_INFER_META(phi::DiagInferMeta)); + 
+REGISTER_OPERATOR(diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, + ops::DiagV2GradOpMaker, + ops::DiagV2GradOpMaker, + DiagInferShapeFunctor); + +REGISTER_OPERATOR(diag_v2_grad, ops::DiagV2GradOp, + ops::DiagGradV2NoNeedBufferVarsInferer); diff --git a/paddle/fluid/operators/diagonal_op.cc b/paddle/fluid/operators/diagonal_op.cc index b419f629a1e635c5a463b732af3003e93a5674d6..bf3cc941539eaeb2e03f53eb2465532469be5697 100644 --- a/paddle/fluid/operators/diagonal_op.cc +++ b/paddle/fluid/operators/diagonal_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,74 +23,6 @@ namespace operators { class DiagonalOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "diagonal"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diagonal"); - - int offset_ = ctx->Attrs().Get("offset"); - int axis1 = ctx->Attrs().Get("axis1"); - int axis2 = ctx->Attrs().Get("axis2"); - - auto x_dims = ctx->GetInputDim("Input"); - int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1; - int axis2_ = axis2 < 0 ? 
x_dims.size() + axis2 : axis2; - - PADDLE_ENFORCE_GE( - x_dims.size(), 2, - platform::errors::OutOfRange("Input's dim is out of range (expected at " - "least 2 dimensions, but got %ld).", - x_dims.size())); - PADDLE_ENFORCE_LT( - axis1_, x_dims.size(), - platform::errors::OutOfRange( - "Attr(axis1) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), (x_dims.size() - 1), axis1)); - PADDLE_ENFORCE_LT( - axis2_, x_dims.size(), - platform::errors::OutOfRange( - "Attr(axis2) is out of range (expected to be in range of [%ld, " - "%ld], but got %ld).", - -(x_dims.size()), (x_dims.size() - 1), axis2)); - PADDLE_ENFORCE_NE(axis1_, axis2_, - platform::errors::InvalidArgument( - "The dimensions should not be identical " - "%d vs %d.", - axis1, axis2)); - - auto out_dims = vectorize(x_dims); - // from out_dims get the dim size of axis1_. - auto axis1_size = out_dims[axis1_]; - auto axis2_size = out_dims[axis2_]; - // delete two dims by attr axis1 and axis2 from out_dims. - /* example: - out_dim = [2, 3, 4]; - axis1 = 0; - axis2 = 1; - according to the attr of axis1 and axis2, we get: - out_dim = [4]. 
- */ - out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); - out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); - - if (offset_ == 0) { - out_dims.push_back(std::min(axis1_size, axis2_size)); - } else if (offset_ > 0) { - if ((axis2_size - offset_) > 0) { - out_dims.push_back(std::min(axis1_size, axis2_size - offset_)); - } else { - out_dims.push_back(0); - } - } else { - if ((axis1_size + offset_) > 0) { - out_dims.push_back(std::min(axis1_size + offset_, axis2_size)); - } else { - out_dims.push_back(0); - } - } - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - } }; class DiagonalOpMaker : public framework::OpProtoAndCheckerMaker { @@ -170,9 +105,13 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagonalGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(diagonal, DiagonalInferShapeFunctor, + PD_INFER_META(phi::DiagonalInferMeta)); + REGISTER_OPERATOR(diagonal, ops::DiagonalOp, ops::DiagonalOpMaker, ops::DiagonalGradOpMaker, - ops::DiagonalGradOpMaker); + ops::DiagonalGradOpMaker, + DiagonalInferShapeFunctor); REGISTER_OPERATOR(diagonal_grad, ops::DiagonalGradOp, ops::DiagonalGradNoNeedBufferVarsInferer) diff --git a/paddle/fluid/operators/dist_op.cc b/paddle/fluid/operators/dist_op.cc index 3a53f1365567f99c9446077f7939d87c156c9a08..55b2484941293c8db47ef847bea959ebe82ff3ae 100644 --- a/paddle/fluid/operators/dist_op.cc +++ b/paddle/fluid/operators/dist_op.cc @@ -12,10 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/dist_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -121,13 +124,11 @@ class DistGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(dist, DistInferShapeFunctor, + PD_INFER_META(phi::DistInferMeta)); + REGISTER_OPERATOR(dist, ops::DistOp, ops::DistOpMaker, ops::DistGradOpMaker, - ops::DistGradOpMaker); + ops::DistGradOpMaker, + DistInferShapeFunctor); REGISTER_OPERATOR(dist_grad, ops::DistOpGrad); -REGISTER_OP_CPU_KERNEL( - dist, ops::DistKernel, - ops::DistKernel); -REGISTER_OP_CPU_KERNEL( - dist_grad, ops::DistGradKernel, - ops::DistGradKernel) diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h deleted file mode 100644 index dfd7e29a8d0102261746ab47d3e1e805a674d7b1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dist_op.h +++ /dev/null @@ -1,304 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -using EigenTensor = framework::EigenTensor; -using framework::Tensor; - -template -static void GetBraodcastDims(const framework::DDim& x_dims, - const framework::DDim& y_dims, - Eigen::DSizes* x_bcast_dims, - Eigen::DSizes* y_bcast_dims) { - int bcast_dims_remainder = 0; - for (int i = 0; i < x_dims.size(); ++i) { - if (x_dims[i] >= y_dims[i]) { - (*x_bcast_dims)[i] = 1; - (*y_bcast_dims)[i] = x_dims[i] / y_dims[i]; - bcast_dims_remainder += x_dims[i] % y_dims[i]; - } else { - (*y_bcast_dims)[i] = 1; - (*x_bcast_dims)[i] = y_dims[i] / x_dims[i]; - bcast_dims_remainder += y_dims[i] % x_dims[i]; - } - } - PADDLE_ENFORCE_EQ(bcast_dims_remainder, 0, - platform::errors::PreconditionNotMet( - "The input tensor of Op(dist) could not be broadcast, " - "X's shape is [%s], Y's shape is [%s].", - x_dims, y_dims)); -} - -static framework::DDim GetNewDims(const framework::DDim& in_dims, int rank) { - std::vector new_dims_vec(rank); - if (in_dims.size() < rank) { - for (int i = 0; i < rank - in_dims.size(); ++i) { - new_dims_vec[i] = 1; - } - for (int i = 0; i < in_dims.size(); ++i) { - new_dims_vec[i + rank - in_dims.size()] = in_dims[i]; - } - } else { - new_dims_vec = vectorize(in_dims); - } - return phi::make_ddim(new_dims_vec); -} - -template -static void DistFunction(const framework::ExecutionContext& context) { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - auto p = context.Attr("p"); - out->mutable_data(context.GetPlace()); - - auto x_dims = context.Input("X")->dims(); - auto y_dims = context.Input("Y")->dims(); - - // new dims with same size as rank, e.g. 
(rank=3, (4, 3) => (1, 4, 3)) - framework::DDim x_new_dims = GetNewDims(x_dims, Rank); - framework::DDim y_new_dims = GetNewDims(y_dims, Rank); - - auto x_t = EigenTensor::From(*x, x_new_dims); - auto y_t = EigenTensor::From(*y, y_new_dims); - auto out_t = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - // p=0 means number of non-zero elements of (x-y) - // p=inf means the maximum of |x-y| - // p=-inf means the minimum of |x-y| - // otherwise, Lp-norm = pow(sum(pow(|x-y|, p)), 1/p) - if (p == 0) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) != y_t.broadcast(y_bcast_dims)) - .template cast() - .sum(); - } else if (p == INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .maximum(); - } else if (p == -INFINITY) { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .minimum(); - } else { - out_t.device(place) = - (x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims)) - .abs() - .pow(p) - .sum() - .pow(1.0 / p); - } -} - -template -static void DistGradFunction(const framework::ExecutionContext& context) { - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Input("Out"); - auto p = context.Attr("p"); - - auto x_grad = context.Output(framework::GradVarName("X")); - auto y_grad = context.Output(framework::GradVarName("Y")); - auto out_grad = context.Input(framework::GradVarName("Out")); - - auto x_dims = context.Input("X")->dims(); - auto y_dims = context.Input("Y")->dims(); - auto out_dims = context.Input("Out")->dims(); - - framework::DDim x_new_dims = GetNewDims(x_dims, Rank); - framework::DDim y_new_dims = GetNewDims(y_dims, Rank); - framework::DDim out_new_dims = GetNewDims(out_dims, Rank); - auto x_t = EigenTensor::From(*x, x_new_dims); 
- auto y_t = EigenTensor::From(*y, y_new_dims); - auto out_t = EigenTensor::From(*out, out_new_dims); - - Eigen::DSizes x_bcast_dims; - Eigen::DSizes y_bcast_dims; - Eigen::DSizes out_bcast_dims; - - GetBraodcastDims(x_new_dims, y_new_dims, &x_bcast_dims, &y_bcast_dims); - std::vector new_dims_vec(Rank); - for (int i = 0; i < Rank; ++i) { - new_dims_vec[i] = std::max(x_new_dims[i], y_new_dims[i]); - out_bcast_dims[i] = new_dims_vec[i]; - } - framework::DDim new_dims = phi::make_ddim(new_dims_vec); - - auto& place = - *context.template device_context().eigen_device(); - auto out_grad_t = EigenTensor::From(*out_grad, out_new_dims); - framework::Tensor grad; - grad.mutable_data(new_dims, context.GetPlace()); - auto grad_t = EigenTensor::From(grad); - - auto x_minux_y = x_t.broadcast(x_bcast_dims) - y_t.broadcast(y_bcast_dims); - auto x_minux_y_abs = x_minux_y.abs(); - auto sign = - (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + - (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); - T epsilon = static_cast(1.0e-10f); - - // 1: Lp-norm(z), z = x-y, compute dz - if (p == 0) { - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, &grad, static_cast(0)); - } else if (p == INFINITY || p == -INFINITY) { - // p=inf or -inf, Lp-norm = |z_i|, the j-th element of dz tends to 0 if - // j!=i, or equals to sign(z_i) * dout if j=i. 
- if (platform::is_cpu_place(context.GetPlace())) { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = (x_minux_y_abs == out_t.broadcast(out_bcast_dims)) - .template cast() * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } else { - // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout - if (platform::is_cpu_place(context.GetPlace())) { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign.eval() * out_grad_t.broadcast(out_bcast_dims); - } else { - grad_t.device(place) = - (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) - .pow(p - 1) * - sign * out_grad_t.broadcast(out_bcast_dims); - } - } - - Eigen::DSizes x_reshape_dims; - Eigen::DSizes y_reshape_dims; - Eigen::DSizes reduce_dims; - for (int i = 0; i < x_new_dims.size(); ++i) { - x_reshape_dims[2 * i] = x_bcast_dims[i]; - x_reshape_dims[2 * i + 1] = x_new_dims[i]; - y_reshape_dims[2 * i] = y_bcast_dims[i]; - y_reshape_dims[2 * i + 1] = y_new_dims[i]; - reduce_dims[i] = 2 * i; - } - - // 2: if x or y is broadcasted in forward function, - // the grad need to be sum along the broadcasted dimensions - if (x_grad) { - x_grad->mutable_data(context.GetPlace()); - auto x_grad_t = EigenTensor::From(*x_grad, x_new_dims); - x_grad_t.device(place) = grad_t.reshape(x_reshape_dims) - .sum(reduce_dims) - .reshape(x_grad_t.dimensions()); - } - if (y_grad) { - y_grad->mutable_data(context.GetPlace()); - auto y_grad_t = EigenTensor::From(*y_grad, y_new_dims); - y_grad_t.device(place) = -grad_t.reshape(y_reshape_dims) - .sum(reduce_dims) - .reshape(y_grad_t.dimensions()); - } -} - -template -class DistKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto x_rank = context.Input("X")->dims().size(); - auto y_rank = context.Input("Y")->dims().size(); - 
auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, 6, - platform::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, y_rank)); - switch (rank) { - case 1: - DistFunction(context); - break; - case 2: - DistFunction(context); - break; - case 3: - DistFunction(context); - break; - case 4: - DistFunction(context); - break; - case 5: - DistFunction(context); - break; - case 6: - DistFunction(context); - break; - } - } -}; - -template -class DistGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto x_rank = context.Input("X")->dims().size(); - auto y_rank = context.Input("Y")->dims().size(); - auto rank = std::max(x_rank, y_rank); - PADDLE_ENFORCE_LE(rank, 6, - platform::errors::Unimplemented( - "Op(dist) only support tensors with no more than 6 " - "dimensions, but X's rank is %d, Y's rank is %d.", - x_rank, y_rank)); - switch (rank) { - case 1: - DistGradFunction(context); - break; - case 2: - DistGradFunction(context); - break; - case 3: - DistGradFunction(context); - break; - case 4: - DistGradFunction(context); - break; - case 5: - DistGradFunction(context); - break; - case 6: - DistGradFunction(context); - break; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h deleted file mode 100644 index c13bf687af23470d4595def6fb6fabf7385c999f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/distribution_helper.h +++ /dev/null @@ -1,244 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#ifdef __NVCC__ -#include -#endif -#ifdef __HIPCC__ -#include -#endif - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/core/hostdevice.h" - -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/phi/kernels/primitive/kernel_primitives.h" -#endif - -#if !defined(_WIN32) -#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) -#else -// there is no equivalent intrinsics in msvc. 
-#define UNLIKELY(condition) (condition) -#endif - -namespace paddle { -namespace distribution { - -using Tensor = framework::Tensor; - -/********************* Transformation Function **********************/ -template -struct exponential_transform { - explicit exponential_transform(T lambda) : lambda_(lambda) {} - - HOSTDEVICE inline T operator()(T val) const { -#if defined(__NVCC__) || defined(__HIPCC__) - if (std::is_same::value) { - return static_cast(-1.0) / lambda_ * log(val); - } else { - return static_cast(-1.0) / lambda_ * __logf(val); - } -#else - return static_cast(-1.0) / lambda_ * std::log(static_cast(1.0) - val); -#endif - } - - private: - T lambda_; -}; - -template -struct uniform_transform { - explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {} - - HOSTDEVICE inline T operator()(T val) const { - if (UNLIKELY(val == static_cast(1.0))) { - return min_; - } else { - return val * range_ + min_; - } - } - - private: - T range_; - T min_; -}; - -template -struct normal_transform { - explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {} - - HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; } - - private: - T mean_; - T std_; -}; - -#if defined(__NVCC__) || defined(__HIPCC__) - -namespace kps = phi::kps; - -/*********************** Distribution Function *************************/ -template -struct uniform_distribution; - -template -struct normal_distribution; - -#if defined(__NVCC__) -template <> -struct uniform_distribution { - __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { - return curand_uniform4(state); - } - static constexpr int kReturnsCount = 4; -}; - -template <> -struct uniform_distribution { - __device__ inline double2 operator()( - curandStatePhilox4_32_10_t *state) const { - return curand_uniform2_double(state); - } - static constexpr int kReturnsCount = 2; -}; - -template <> -struct normal_distribution { - __device__ inline float4 
operator()(curandStatePhilox4_32_10_t *state) const { - return curand_normal4(state); - } - static constexpr int kReturnsCount = 4; -}; - -template <> -struct normal_distribution { - __device__ inline double2 operator()( - curandStatePhilox4_32_10_t *state) const { - return curand_normal2_double(state); - } - static constexpr int kReturnsCount = 2; -}; - -#else -template <> -struct uniform_distribution { - __device__ inline float4 operator()( - hiprandStatePhilox4_32_10_t *state) const { - return hiprand_uniform4(state); - } - static constexpr int kReturnsCount = 4; -}; - -template <> -struct uniform_distribution { - __device__ inline double2 operator()( - hiprandStatePhilox4_32_10_t *state) const { - return hiprand_uniform2_double(state); - } - static constexpr int kReturnsCount = 2; -}; - -template <> -struct normal_distribution { - __device__ inline float4 operator()( - hiprandStatePhilox4_32_10_t *state) const { - return hiprand_normal4(state); - } - static constexpr int kReturnsCount = 4; -}; - -template <> -struct normal_distribution { - __device__ inline double2 operator()( - hiprandStatePhilox4_32_10_t *state) const { - return hiprand_normal2_double(state); - } - static constexpr int kReturnsCount = 2; -}; -#endif - -/******** Launch GPU function of distribution and transformation *********/ -template -__global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset, - DistOp dist, TransformOp trans, T *out_data, - size_t stride) { - size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); - static constexpr int kCount = DistOp::kReturnsCount; -#if defined(__NVCC__) - curandStatePhilox4_32_10_t state; - curand_init(seed, idx + THREAD_ID_X, offset, &state); - using SType = curandStatePhilox4_32_10_t; -#else - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed, idx + THREAD_ID_X, offset, &state); - using SType = hiprandStatePhilox4_32_10_t; -#endif - size_t total_thread = GRID_NUM_X * BLOCK_NUM_X; - T args[kCount]; - T result[kCount]; - for 
(size_t i = idx; i < size; i += total_thread * kCount) { - kps::ElementwiseRandom(&args[0], dist, &state); - kps::ElementwiseUnary(&result[0], &args[0], - trans); - kps::WriteData(out_data + i, &result[0], size - i, - 1, stride, 1); - __syncthreads(); - } -} - -template -void distribution_and_transform(const platform::CUDADeviceContext &dev_ctx, - Tensor *out, DistOp dist, TransformOp trans) { - T *out_data = out->mutable_data(dev_ctx.GetPlace()); - auto size = out->numel(); - - int64_t device_id = dev_ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - - size_t block_size = 256; - size_t expect_grid_size = (size + block_size - 1) / block_size; - const auto &prop = platform::GetDeviceProperties(device_id); - size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) * - prop.multiProcessorCount; - size_t grid_size = - expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size; - - size_t total_thread = block_size * grid_size; - size_t curand4_loop_times = - (size + 4 * total_thread - 1) / (4 * total_thread); - // 'increment' shoulde be multiple of 4 - uint64_t increment = curand4_loop_times * 4; - - auto seed_offset = gen_cuda->IncrementOffset(increment); - uint64_t seed = seed_offset.first; - uint64_t offset = seed_offset.second; - - DistributionKernel< - T, DistOp, TransformOp><<>>( - size, seed, offset, dist, trans, out_data, total_thread); -} - -#endif - -} // namespace distribution -} // namespace paddle diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index a86a3bb35927d53d20bef91a0bf36695a268c348..8efdd15781a6f2dab48c0680ba87c7b427dc60ec 100644 --- a/paddle/fluid/operators/dot_op.cc +++ b/paddle/fluid/operators/dot_op.cc @@ -101,8 +101,8 @@ class DotOpGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor, - PT_INFER_META(phi::DotInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(dot, 
DotInferShapeFunctor, + PD_INFER_META(phi::DotInferMeta)); REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker, ops::DotOpGradMaker, diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 2fa956a2e6515e8b6a8e1c463c8ab8d1476f8d90..144198367d538e178a745c22902bb77a65f45fe4 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -32,10 +32,9 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/dropout_impl_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/aligned_vector.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { @@ -86,8 +85,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, bool is_upscale_in_train, uint64_t increment) { using MT = typename details::MPTypeTrait::Type; - using LoadT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; #ifdef PADDLE_WITH_HIP int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; @@ -102,7 +101,7 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, MT factor = static_cast(1.0f / (1.0f - dropout_prob)); for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) { LoadT src_val; - platform::Load(&src[i], &src_val); + phi::Load(&src[i], &src_val); #ifdef PADDLE_WITH_HIP float4 rand = hiprand_uniform4(&state); @@ -126,8 +125,8 @@ __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, } } - platform::Store(dst_val, &dst[i]); - platform::Store(mask_val, &mask[i]); + phi::Store(dst_val, &dst[i]); + 
phi::Store(mask_val, &mask[i]); } } @@ -153,16 +152,16 @@ __global__ void DropoutGradCUDAKernel( const typename details::MPTypeTrait::Type factor, const int64_t size, T* dx) { using MT = typename details::MPTypeTrait::Type; - using LoadT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_val; - platform::Load(&dout[i], &dout_val); + phi::Load(&dout[i], &dout_val); MaskLoadT mask_val; - platform::Load(&mask[i], &mask_val); + phi::Load(&mask[i], &mask_val); LoadT dx_val; @@ -172,27 +171,28 @@ __global__ void DropoutGradCUDAKernel( static_cast(mask_val[j]) * factor); } - platform::Store(dx_val, &dx[i]); + phi::Store(dx_val, &dx[i]); } } template -void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - bool is_test, +void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, const std::string dropout_implementation, float dropout_prob, bool upscale_in_train, - bool is_fix_seed, int seed_val, const Tensor& x, - const Tensor* seed, Tensor* mask, Tensor* y) { + bool is_fix_seed, int seed_val, + const framework::Tensor& x, + const framework::Tensor* seed, + framework::Tensor* mask, framework::Tensor* y) { auto& place = *dev_ctx.eigen_device(); + int64_t x_numel = x.numel(); + auto stream = dev_ctx.stream(); + auto* x_data = x.data(); + auto* y_data = y->data(); if (!is_test) { - int64_t x_numel = x.numel(); - auto stream = dev_ctx.stream(); auto* mask_data = mask->data(); size_t size = phi::product(mask->dims()); - auto* x_data = x.data(); - auto* y_data = y->data(); if (dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( @@ -219,8 +219,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, uint64_t increment; // VectorizedRandomGenerator use 
curand_uniform4, so we only support // vec_size is 4; - int vec_size = (platform::GetVectorizedSize(x_data) == 4) ? 4 : 1; - auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); + int vec_size = (phi::GetVectorizedSize(x_data) == 4) ? 4 : 1; + auto gpu_config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); auto offset = ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size; @@ -254,22 +255,37 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, } #endif } else { - auto X = EigenMatrix::Reshape(x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); if (upscale_in_train) { - Y.device(place) = X; +// todo: can y share with data with x directly? +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemcpyAsync(y_data, x_data, sizeof(T) * x_numel, + hipMemcpyDeviceToDevice, stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemcpyAsync(y_data, x_data, sizeof(T) * x_numel, + cudaMemcpyDeviceToDevice, stream)); +#endif } else { - Y.device(place) = X * static_cast(1.0f - dropout_prob); + using MT = typename details::MPTypeTrait::Type; + MT factor = static_cast(1.0f - dropout_prob); + std::vector ins = {&x}; + std::vector outs = {y}; + auto functor = phi::funcs::ScaleFunctor(factor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } } } template -void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, +void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, const std::string dropout_implementation, - float dropout_prob, const Tensor& grad_y, - const Tensor& mask, int64_t size, - Tensor* grad_x, bool is_test = false) { + float dropout_prob, + const framework::Tensor& grad_y, + const framework::Tensor& mask, int64_t size, + framework::Tensor* grad_x, + bool is_test = false) { using MT = typename details::MPTypeTrait::Type; auto stream = dev_ctx.stream(); MT factor; diff --git a/paddle/fluid/operators/dropout_impl_util.h 
b/paddle/fluid/operators/dropout_impl_util.h index d7db7dddce3887ca25ea1df34048f15663b2e987..c62d45570ba291dc60120c393d21842cc6548c61 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, +inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, const framework::Tensor* seed, const bool is_fix_seed, const int seed_val, const int offset, uint64_t* seed_data, diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 7613b04bccfdc2084decc0b383eec199f7e10991..6d52ce45c4c10099dbeb4d4fadbf91f8c390ef46 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/dropout_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -177,14 +177,3 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ops::DropoutGradOpMaker, ops::DropoutGradOpMaker); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); -REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel, - ops::CPUDropoutKernel, - ops::CPUDropoutKernel); -REGISTER_OP_CPU_KERNEL( - dropout_grad, - ops::DropoutGradKernel, - ops::DropoutGradKernel, - ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu deleted file mode 100644 index f6ddff1d0327d3c7961781f875da69f89df1edec..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dropout_op.cu +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/dropout_impl.cu.h" -#include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random. -template -class GPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? 
context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = context.cuda_device_context(); - auto* mask = context.Output("Mask"); - mask->mutable_data(context.GetPlace()); - - bool is_fix_seed = context.Attr("fix_seed"); - int seed_val = context.Attr("seed"); - DropoutFwGPUKernelDriver(dev_ctx, is_test, dropout_implementation, - dropout_prob, upscale_in_train, is_fix_seed, - seed_val, *x, seed, mask, y); - } -}; - -template -class GPUDropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - auto size = grad_x->numel(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - float dropout_prob = context.Attr("dropout_prob"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = - context.template device_context(); - DropoutGradGPUKernelDriver(dev_ctx, dropout_implementation, dropout_prob, - *grad_y, *mask, size, grad_x, is_test); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - dropout, ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel); -REGISTER_OP_CUDA_KERNEL( - dropout_grad, ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h deleted file 
mode 100644 index ea6ed0e61947470c22f18e47acce2fca4cb9c41f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dropout_op.h +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include -#include - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -template -class CPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? 
context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - const auto* x_data = x->data(); - auto* y_data = y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - if (!context.Attr("is_test")) { - auto* mask = context.Output("Mask"); - auto* mask_data = mask->mutable_data(context.GetPlace()); - size_t size = phi::product(mask->dims()); - - // Special case when dropout_prob is 1.0 - if (dropout_prob == 1.0f) { - std::memset(y_data, 0, size * sizeof(*y_data)); // NOLINT - std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT - return; - } - // std::minstd_rand engine; - // NOTE: fixed seed should only be used in unittest or for debug. - // Guarantee to use random seed in training. - int seed_data = 0; - if (seed) { - seed_data = *(seed->data()); - } else { - seed_data = - context.Attr("fix_seed") ? 
context.Attr("seed") : 0; - } - auto engine = framework::GetCPURandomEngine(seed_data); - - std::uniform_real_distribution dist(0, 1); - - for (size_t i = 0; i < size; ++i) { - if (dist(*engine) < dropout_prob) { - mask_data[i] = 0; - y_data[i] = 0; - } else { - mask_data[i] = 1; - if (upscale_in_train) { - y_data[i] = x_data[i] / static_cast(1.0f - dropout_prob); - } else { - y_data[i] = x_data[i]; - } - } - } - } else { - if (upscale_in_train) { - const auto* X_data = x->data(); - auto* Y_data = y->mutable_data(context.GetPlace()); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int i = 0; i < x->numel(); i++) { - Y_data[i] = X_data[i]; - } - } else { - auto X = EigenMatrix::Reshape(*x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); - auto& place = - *context.template device_context().eigen_device(); - Y.device(place) = X * static_cast(1.0f - dropout_prob); - } - } - } -}; -template -class DropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - - auto dX = EigenVector::Flatten(*grad_x); - auto dY = EigenVector::Flatten(*grad_y); - - auto& place = - *context.template device_context().eigen_device(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - if (context.Attr("is_test") == true) { - if (dropout_implementation == "upscale_in_train") { - dX.device(place) = static_cast(1) * dY; - } else { - float dropout_prob = context.Attr("dropout_prob"); - dX.device(place) = dY * static_cast(1.0f - dropout_prob); - } - } else { - auto M = EigenVector::Flatten(*mask); - if (dropout_implementation == "upscale_in_train") { - float dropout_prob = context.Attr("dropout_prob"); - if (dropout_prob == 1.0f) { - dX.device(place) = static_cast(0) * dY; - } 
else { - dX.device(place) = - dY * M.cast() / static_cast(1.0f - dropout_prob); - } - } else { - dX.device(place) = dY * M.cast(); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index 6aae566760623c666f3ce82a890a119e3e173390..07b3b5381162575cbfc03dd8cc10d0c88a2d21e8 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 206d9a6c5e9c9869216f0a6c137accc931aa2a77..bdf08646f1d8b94d6d8d141d8a9fa9864cdc937b 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -24,14 +24,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(dropout); +USE_OP_ITSELF(dropout); void Compare(f::Scope* scope, const p::DeviceContext& ctx) { // init diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index 07b7e2cc7c09b09d6640f49fce438d58d0cc9cf2..7d8660f238abc8446b2988aad24a64c565e01ef9 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -8,15 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/dropout_op.h" + #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { #ifdef PADDLE_WITH_XPU +using Tensor = framework::Tensor; template class DropoutXPUKernel : public framework::OpKernel { using XPUTyp = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index e9c6c1eb7eced7b74294b679ce88a7eb76e90440..5e4c83e1a45ebdb96a0e764cfa2d3997442ae1ea 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -18,12 +18,19 @@ #include #include #include "paddle/fluid/operators/math/matrix_solve.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + #define EPSILON 1e-6 namespace paddle { @@ -214,12 +221,17 @@ class EigKernel : public framework::OpKernel { ApplyEigKernel>( *x, &real_values, &real_vectors, context); - auto dito = math::DeviceIndependenceTensorOperations< - DeviceContext, phi::dtype::Real, Tout>(context); + + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); // 1. 
extract real part & imag part from real_values - Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); - Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); + Tensor real_part = + phi::funcs::Slice(dev_ctx, real_values, {-1}, {0}, {order}); + Tensor imag_part = phi::funcs::Slice(dev_ctx, real_values, {-1}, + {order}, {order * 2}); // 2. construct complex values auto* real_part_data = real_part.data>(); @@ -233,7 +245,8 @@ class EigKernel : public framework::OpKernel { for_range(functor); // 3. construct complex vectors - Tensor real_vector_trans = dito.Transpose(real_vectors); + Tensor real_vector_trans = + phi::TransposeLast2Dim(dev_ctx, real_vectors); Tensor out_vectors_trans; out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); ConstructComplexVectors, Tout>( @@ -251,45 +264,48 @@ class EigKernel : public framework::OpKernel { } }; -template +template void ComputeBackwardForComplexInput( const Tensor& V, const Tensor& L, const Tensor& gL, const Tensor& gV, - Tout* x_grad_data, int batch_count, int order, + T* x_grad_data, int batch_count, int order, const framework::ExecutionContext& context) { - auto dito = - math::DeviceIndependenceTensorOperations( - context); - - Tensor trans_v = dito.Transpose(V); - Tensor Vh = dito.Conj(trans_v); - Tensor Lconj = dito.Conj(L); - Tensor Econj = dito.Sub(dito.Unsqueeze(Lconj, -2), dito.Unsqueeze(Lconj, -1)); - Tensor VhgV = dito.Matmul(Vh, gV); - Tensor diag_real = dito.Real(VhgV); - Tensor diag_res = dito.BatchDiag(diag_real, batch_count); - Tensor diag_unsqueezed = dito.Unsqueeze(diag_res, -2); + auto& orig_dev_ctx = context.template device_context(); + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + + Tensor trans_v = phi::TransposeLast2Dim(dev_ctx, V); + Tensor Vh = phi::Conj(dev_ctx, trans_v); + Tensor Lconj = phi::Conj(dev_ctx, L); + Tensor Econj = phi::Subtract(dev_ctx, phi::funcs::Unsqueeze(Lconj, -2), + 
phi::funcs::Unsqueeze(Lconj, -1)); + Tensor VhgV = phi::Matmul(dev_ctx, Vh, gV); + Tensor diag_real = phi::Real(dev_ctx, VhgV); + Tensor diag_res = phi::funcs::BatchDiag(dev_ctx, diag_real, batch_count); + Tensor diag_unsqueezed = phi::funcs::Unsqueeze(diag_res, -2); // turn diag_unsqueezed into complex auto numel = diag_unsqueezed.numel(); Tensor diag_unsqueezed_complex; - auto* data_diag_un = diag_unsqueezed.data>(); - auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( + auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( diag_unsqueezed.dims(), context.GetPlace(), - static_cast(numel * sizeof(Tout))); - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, - numel); + static_cast(numel * sizeof(T))); + + platform::ForRange for_range(orig_dev_ctx, numel); + phi::funcs::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, + numel); for_range(functor); // real tensor multiply complex tensor in broadcast manner - Tensor res1 = dito.RealMulComplex(V, diag_unsqueezed_complex); - Tensor res2 = dito.Matmul(Vh, res1); - Tensor result = dito.Sub(VhgV, res2); + Tensor res1 = phi::Multiply(dev_ctx, V, diag_unsqueezed_complex); + Tensor res2 = phi::Matmul(dev_ctx, Vh, res1); + Tensor result = phi::Subtract(dev_ctx, VhgV, res2); - result.mutable_data(V.dims(), context.GetPlace()); - result = dito.Div(result, Econj); - result = dito.DiagFill(order, order, order, 0, gL, result); - Tensor rhs = dito.Matmul(result, Vh); + result.mutable_data(V.dims(), context.GetPlace()); + result = phi::Divide(dev_ctx, result, Econj); + result = + phi::funcs::DiagFill(dev_ctx, order, order, order, 0, gL, result); + Tensor rhs = phi::Matmul(dev_ctx, result, Vh); // solve linear system // solve(Vh, rhs, out, m, k) @@ -298,10 +314,10 @@ void ComputeBackwardForComplexInput( // x_grad: out int m = 
Vh.dims()[Vh.dims().size() - 1]; int k = rhs.dims()[rhs.dims().size() - 1]; - auto* matrix_data = Vh.data(); - auto* rhs_data = rhs.data(); - math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, - batch_count); + auto* matrix_data = Vh.data(); + auto* rhs_data = rhs.data(); + math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, + batch_count); } template diff --git a/paddle/fluid/operators/eigh_op.cc b/paddle/fluid/operators/eigh_op.cc index 553d0e679cc6ddebd68c3edbc2de70209364bb53..4e33c567eb6d12fc504bfd76bc83072836feda21 100644 --- a/paddle/fluid/operators/eigh_op.cc +++ b/paddle/fluid/operators/eigh_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eigh_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,42 +25,9 @@ using framework::Tensor; class EighOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvalues"), "Output", "Eigenvalues", - "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvectors"), "Output", "Eigenvectors", - "Eigh"); - - auto input_dim = ctx->GetInputDim("X"); - auto rank = input_dim.size(); - - PADDLE_ENFORCE_GE(rank, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions." 
- "But received a %d dimension tensor.", - rank)); - PADDLE_ENFORCE_EQ( - input_dim[rank - 2], input_dim[rank - 1], - platform::errors::InvalidArgument( - "Eigh op is designed for square matrix, consequently" - "inner-most 2 dimensions of Input(X) should be symmetric." - "But received X's shape[-2] = %d and shape[-1] = %d.", - input_dim[rank - 2], input_dim[rank - 1])); - - std::vector values_dim; - - for (auto i = 0; i < rank - 1; i++) { - values_dim.emplace_back(input_dim[i]); - } - - ctx->SetOutputDim("Eigenvalues", phi::make_ddim(values_dim)); - ctx->SetOutputDim("Eigenvectors", input_dim); - } }; -class EignOpMaker : public framework::OpProtoAndCheckerMaker { +class EighOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", @@ -140,24 +110,11 @@ class EighGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(eigh, EighInferShapeFunctor, + PD_INFER_META(phi::EighInferMeta)); -REGISTER_OPERATOR(eigh, ops::EighOp, ops::EignOpMaker, +REGISTER_OPERATOR(eigh, ops::EighOp, ops::EighOpMaker, ops::EighGradOpMaker, - ops::EighGradOpMaker); + ops::EighGradOpMaker, + EighInferShapeFunctor); REGISTER_OPERATOR(eigh_grad, ops::EighGradOp); - -REGISTER_OP_CPU_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CPU_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.cu b/paddle/fluid/operators/eigh_op.cu deleted file mode 100644 index 827c551637d4df24529508ff37e6a92f157658a0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/eigh_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/eigh_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CUDA_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h deleted file mode 100644 index 5279ec750935c9b1b01584e893cc5e5f85d4a75c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/eigh_op.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/eigen_values_vectors.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class EighKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("X"); - auto output_w = ctx.Output("Eigenvalues"); - auto output_v = ctx.Output("Eigenvectors"); - std::string lower = ctx.Attr("UPLO"); - bool is_lower = (lower == "L"); - math::MatrixEighFunctor functor; - functor(ctx, *input, output_w, output_v, is_lower, true); - } -}; - -template -class EighGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using ValueType = phi::dtype::Real; - auto& x_grad = *ctx.Output(framework::GradVarName("X")); - x_grad.mutable_data(ctx.GetPlace()); - auto& output_w = *ctx.Input("Eigenvalues"); - auto& output_v = *ctx.Input("Eigenvectors"); - auto& output_w_grad = - *ctx.Input(framework::GradVarName("Eigenvalues")); - auto& output_v_grad = - *ctx.Input(framework::GradVarName("Eigenvectors")); - - auto& dims = output_v.dims(); - const int m = dims[dims.size() - 1]; - auto dito = - math::DeviceIndependenceTensorOperations( - ctx); - auto tV = dito.Transpose(dito.Conj(output_v)); - auto W = dito.template Sub(dito.Unsqueeze(output_w, -2), - dito.Unsqueeze(output_w, -1)); - Tensor result = dito.Matmul(tV, output_v_grad); - result.mutable_data(dims, ctx.GetPlace()); - std::vector out_shape = phi::vectorize(dims); - auto constant = dito.Fill(out_shape, 0.5); - result = dito.Sub(result, dito.Conj(dito.Transpose(result))); - result = dito.Mul(result, constant); - result = dito.Div(result, W); - result = dito.DiagFill(m, m, m, 0, output_w_grad, result); - x_grad = dito.Matmul(output_v, dito.Matmul(result, tV)); - } -}; - -} // namespace operators -} // namespace paddle diff --git 
a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 38cd232e4d1d2237cb5da014d11ba69a91cbe917..13fd9b81a8765aea140ad6ca2fc0383151a51dc7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -102,42 +102,6 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(elementwise_div_grad_grad, ops::ElementwiseDivOpDoubleGrad, ops::ElementwiseDoubleGradOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); - -REGISTER_OP_CPU_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); - REGISTER_OP_VERSION(elementwise_div) .AddCheckpoint( R"ROC(Register elementwise_div for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu deleted file mode 100644 index 9eb4b0352e5337e3fdd758d2e95cfa61d1d62724..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/elementwise_div_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = ctx.template device_context(); - const auto place = ctx.GetPlace(); - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, out, y}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, DivGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, DivGradXFunctor()); - } else if (dy != nullptr && dx == nullptr) { - std::vector ins = {dout, out, y}; - GetGradXOrYOut( - dev_ctx, place, axis, ins, dout, dy, DivGradYFunctor()); - } -} - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel>, - ops::ElementwiseDivKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - 
ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel>, - ops::ElementwiseDivGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_div_grad_grad, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel, - ops::ElementwiseDivDoubleGradKernel>, - ops::ElementwiseDivDoubleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index c58a7f36548a57a1c8e7770fa282470fba4cc140..e9adb9abdb528c187817be641b81ffb6f64833b0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -20,142 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -void default_elementwise_sub(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - SubFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseSubFunctor(), z); - } -} - -template -void default_elementwise_div(const framework::ExecutionContext& ctx, - const framework::Tensor* x, - const framework::Tensor* y, framework::Tensor* z) { - int axis = ctx.Attr("axis"); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - DivFunctor(), z); - } else { - ElementwiseComputeEx, DeviceContext, T>( - ctx, x, y, axis, InverseDivFunctor(), z); - } -} - -template -class ElementwiseDivKernel : public framework::OpKernel { - 
public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePhiDenseTensor(*x); - auto pt_y = paddle::experimental::MakePhiDenseTensor(*y); - auto pt_z = paddle::experimental::MakePhiDenseTensor(*z); - phi::DivideRawKernel( - static_cast::TYPE&>(dev_ctx), - *pt_x.get(), *pt_y.get(), axis, pt_z.get()); - } -}; - -template -struct DivGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } -}; - -template -struct DivGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout / y_conj; - } -}; - -template -struct DivGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return -dout * out / y; - } -}; - -template -struct DivGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex out_div_y_conj((out / y).real, - -(out / y).imag); - return -dout * out_div_y_conj; - } -}; - -template -struct DivDoubleDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return y * out * dout - x * dout; - } -}; - -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - - ElemwiseGradCompute, DivGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, 
DivGradDX(), DivGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseDivGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseDivGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseDivGrad(ctx, x, y, out, dout, dx, dy); - } -}; - class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -206,80 +70,5 @@ class ElementwiseDivOpDoubleGrad : public framework::OperatorWithKernel { } }; -template -class ElementwiseDivDoubleGradKernel : public framework::OpKernel { - using Tensor = framework::Tensor; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Input("Out"); - auto* ddX = ctx.Input("DDX"); - auto* ddY = ctx.Input("DDY"); - auto* dX = ctx.Input("DX"); - - auto* dY = ctx.Output(framework::GradVarName("Y")); - auto* dOut = ctx.Output("DOut"); - auto* ddOut = ctx.Output("DDOut"); - - int axis = ctx.Attr("axis"); - - if (dY) dY->mutable_data(Y->dims(), ctx.GetPlace()); - if (dOut) dOut->mutable_data(Out->dims(), ctx.GetPlace()); - if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); - - // ddX_safe == null ? 0 : ddX - // ddY_safe == null ? 
0 : ddY - Tensor ddX_safe, ddY_safe; - GetDoubleGradSafeTensor(ctx, dX, ddX, &ddX_safe); - GetDoubleGradSafeTensor(ctx, Y, ddY, &ddY_safe); - - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - // dY = Out * dX * ddY / Y - dX * ddX / Y - // dOut = - dX * ddY - // To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can - // inplace ddx - Tensor tmp; - if (dOut) { - tmp = *dOut; - } else { - auto& dev_ctx = ctx.template device_context(); - tmp = ctx.AllocateTmpTensor(Out->dims(), dev_ctx); - } - if (dY) { - // dX_div_Y = dX / Y; - Tensor dX_div_Y = tmp; - default_elementwise_div(ctx, dX, Y, &dX_div_Y); - - // NOTE(dengkaipeng): in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. - - // dY = Out * dX * ddY / Y - dX * ddX / Y - ElemwiseGradCompute, DivDoubleDY>( - ctx, ddX_safe, ddY_safe, *Out, dX_div_Y, axis, nullptr, dY, - DivGradDX(), DivDoubleDY()); - } - - if (ddOut) { - // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - default_elementwise_mul(ctx, Out, &ddY_safe, &tmp); - default_elementwise_sub(ctx, &ddX_safe, &tmp, &tmp); - default_elementwise_div(ctx, &tmp, Y, ddOut); - } - - if (dOut) { - // dOut = - dX * ddY - default_elementwise_mul(ctx, dX, &ddY_safe, dOut); - auto& place = - *ctx.template device_context().eigen_device(); - auto dout = framework::EigenVector::Flatten(*dOut); - dout.device(place) = static_cast(-1) * dout; - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 86f5be3071c2d1a84f13da1cef74787003e633bb..54931d99292f9d1453e2a3deb72e75ed63c9f46f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ 
b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -1,11 +1,8 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -90,147 +87,6 @@ struct MinFunctor { template using Complex = paddle::platform::complex; -template -struct DivGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - // dx = dout / y - // dy = - dout * out / y - phi::Array outs; - outs[0] = a / c; - outs[1] = -a * b / c; - return outs; - } -}; - -template -struct DivGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - Complex c_conj(c.real, -c.imag); - Complex out_div_c_conj((b / c).real, -(b / c).imag); - outs[0] = a / c_conj; - outs[1] = -a * out_div_c_conj; - return outs; - } -}; - -// Float div grad -template -struct DivGradXFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } -}; - -// Complex div grad -template -struct DivGradXFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a / b_conj; - } -}; - -// Float mul and div -template -struct DivGradYFunctor { - inline HOSTDEVICE T operator()(const T a, const T b, const T c) const { - return -a * b / c; - } -}; - -// Complex mul and div -template -struct DivGradYFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b, - const Complex c) const { - Complex out_div_c_conj((b / c).real, -(b / c).imag); - return -a * 
out_div_c_conj; - } -}; - -// Fmax -template -struct FMaxFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return std::fmax(a, b); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE paddle::platform::float16 operator()( - const paddle::platform::float16 a, - const paddle::platform::float16 b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmax(float_a, float_b); - return static_cast(result); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE int operator()(const int a, const int b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmax(float_a, float_b); - return std::lrint(result); - } -}; - -template <> -struct FMaxFunctor { - inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { - double double_a = static_cast(a); - double double_b = static_cast(b); - auto result = std::fmax(double_a, double_b); - return std::llrint(result); - } -}; - -// Fmin -template -struct FMinFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { - return std::fmin(a, b); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE paddle::platform::float16 operator()( - const paddle::platform::float16 a, - const paddle::platform::float16 b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmin(float_a, float_b); - return static_cast(result); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE int operator()(const int a, const int b) const { - float float_a = static_cast(a); - float float_b = static_cast(b); - auto result = std::fmin(float_a, float_b); - return std::lrint(result); - } -}; - -template <> -struct FMinFunctor { - inline HOSTDEVICE int64_t operator()(const int64_t a, const int64_t b) const { - double double_a = static_cast(a); - double double_b = static_cast(b); - auto result = std::fmin(double_a, double_b); - return 
std::llrint(result); - } -}; - template struct MinGradXFunctor { inline HOSTDEVICE T operator()(const T x, const T y, const T dout) const { @@ -257,47 +113,6 @@ struct MinGradXYFunctor { } }; -template -struct MulGradFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; } -}; -template -struct MulGradFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a * b_conj; - } -}; - -template -struct MulGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - phi::Array outs; - // dx = dout * y - outs[0] = a * b; - // dy = dout * x - outs[1] = a * c; - return outs; - } -}; - -template -struct MulGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - // dx = dout * y - Complex b_conj(b.real, -b.imag); - outs[0] = a * b_conj; - // dy = dout * x - Complex c_conj(c.real, -c.imag); - outs[1] = a * c_conj; - return outs; - } -}; - // Ternary compare template struct MaxGradXFunctor { diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index 91da732ef0d3dfbda5d9b7734071ec5831bcfa3f..d91315cc511aa80c0e9c44ccc688b2746eac764e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -151,21 +151,3 @@ REGISTER_OPERATOR(elementwise_fmax, ops::ElementwiseOp, ops::ElementwiseFMaxGradOpMaker); REGISTER_OPERATOR(elementwise_fmax_grad, ops::ElementwiseOpGrad); - -REGISTER_OP_CPU_KERNEL( - elementwise_fmax, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_fmax_grad, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - 
ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index 123332a4a23de5c9534c8523993b87d8738f9869..0d5f56fda17322d86ef13990e9fc2432816dc9cb 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -86,21 +86,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel); - -REGISTER_OP_CUDA_KERNEL( - elementwise_fmax, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel, - ops::ElementwiseFMaxKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_fmax_grad, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel, - ops::ElementwiseFMaxGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.h b/paddle/fluid/operators/elementwise/elementwise_max_op.h index cff30be50a3d14c646cb7d13d6d8aeeb3de250f4..afe1073d89a06618af95490ac6d264073bd930d4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.h @@ -35,21 +35,6 @@ class ElementwiseMaxKernel : public framework::OpKernel { } }; -template -class ElementwiseFMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - FMaxFunctor(), z); - } -}; - template struct MaxGradDx { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { @@ -104,88 +89,5 @@ class ElementwiseMaxGradKernel : public ElemwiseGradKernel { } }; -template -struct 
FMaxGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast((x >= y) || isnan(y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - (x >= y) || paddle::platform::isnan(y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast((x >= y)); - } -}; - -template <> -struct FMaxGradDx { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast((x >= y)); - } -}; - -template -struct FMaxGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast(!((x >= y) || isnan(y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - !((x >= y) || paddle::platform::isnan(y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast(!((x >= y))); - } -}; - -template <> -struct FMaxGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast(!((x >= y))); - } -}; - -template -class ElementwiseFMaxGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - 
auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, FMaxGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, FMaxGradDx(), - FMaxGradDy()); - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc index 3a1951999546eb859f6299b0bf7b064ff1b90a1a..dad80a2c33f3abfde457a6d750f89e47374fae13 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc @@ -147,21 +147,3 @@ REGISTER_OPERATOR(elementwise_fmin, ops::ElementwiseOp, ops::ElementwiseFMinGradOpMaker); REGISTER_OPERATOR(elementwise_fmin_grad, ops::ElementwiseOpGrad); - -REGISTER_OP_CPU_KERNEL( - elementwise_fmin, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel); -REGISTER_OP_CPU_KERNEL( - elementwise_fmin_grad, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index 5af985567d898d500b59e10d6032be57871c7e98..fb8bc9ac7f83c8dd99e40685acc68eec4c77b3ce 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -82,21 +82,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMinGradKernel, ops::ElementwiseMinGradKernel); - -REGISTER_OP_CUDA_KERNEL( - elementwise_fmin, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel, - ops::ElementwiseFMinKernel); -REGISTER_OP_CUDA_KERNEL( - elementwise_fmin_grad, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel, - 
ops::ElementwiseFMinGradKernel, - ops::ElementwiseFMinGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.h b/paddle/fluid/operators/elementwise/elementwise_min_op.h index 88fb044d42206eb0f89ac84df166e2e7ff33c5b3..283ad2adde978680d4d0c3a579d55e588368a28e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.h @@ -35,21 +35,6 @@ class ElementwiseMinKernel : public framework::OpKernel { } }; -template -class ElementwiseFMinKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - - z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - FMinFunctor(), z); - } -}; - template struct MinGradDx { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { @@ -124,89 +109,5 @@ class ElementwiseMinGradKernel : public ElemwiseGradKernel { ElementwiseMinGrad(ctx, x, y, out, dout, dx, dy); } }; - -template -struct FMinGradDx { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * static_cast((x <= y) || isnan(y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - (x <= y) || paddle::platform::isnan(y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast((x <= y)); - } -}; - -template <> -struct FMinGradDx { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast((x <= y)); - } -}; - -template -struct FMinGradDy { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { - return dout * 
static_cast(!((x <= y) || isnan(y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE paddle::platform::float16 operator()( - paddle::platform::float16 x, paddle::platform::float16 y, - paddle::platform::float16 out, paddle::platform::float16 dout) const { - return dout * static_cast( - !((x <= y) || paddle::platform::isnan(y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE int operator()(int x, int y, int out, int dout) const { - return dout * static_cast(!((x <= y))); - } -}; - -template <> -struct FMinGradDy { - HOSTDEVICE int64_t operator()(int64_t x, int64_t y, int64_t out, - int64_t dout) const { - return dout * static_cast(!((x <= y))); - } -}; - -template -class ElementwiseFMinGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* out = dout; // Fake out, not used - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, FMinGradDy>( - ctx, *x, *y, *out, *dout, axis, dx, dy, FMinGradDx(), - FMinGradDy()); - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index e172279145e28c0731ed0d8d91769d0b293662fe..830e09eeae4811eb44bd4e21e17fe83ee44c592d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -173,55 +173,6 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>, ops::ElementwiseMulKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - 
ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); REGISTER_OP_VERSION(elementwise_mul) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 45c87a27a180af4798a9f8b31e2edfd0cacb583d..f7b9fd1e265f5d3f107e734f9ffdcc90e7f6cc77 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -63,33 +63,6 @@ class ElementwiseMulKernel } }; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = - ctx.template device_context(); - const auto place = ctx.GetPlace(); - - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, y, x}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - 
std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, MulGradFunctor()); - } else if (dx == nullptr && dy != nullptr) { - std::vector ins = {dout, x}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dy, MulGradFunctor()); - } -} - } // namespace operators } // namespace paddle @@ -103,44 +76,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index c81266d584468f51030026e1423a649252001f58..58a3123c7e332f50b0830577436528f1e8df1cdf 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -137,244 +137,6 @@ class 
ElementwiseMulKernel : public framework::OpKernel { } } }; -template -struct MulGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } -}; - -template -struct MulGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout * y_conj; - } -}; - -template -struct MulGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } -}; - -template -struct MulGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex x_conj(x.real, -x.imag); - return dout * x_conj; - } -}; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MulGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseMulGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = 
ctx.Input(framework::GradVarName("Out")); - auto* out = dout; // out is not necessary - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseMulGrad(ctx, x, y, out, dout, dx, dy); - } -}; - -template -class ElementwiseMulDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* ddout = ctx.Output("DDOut"); - - if (ddout) ddout->mutable_data(ctx.GetPlace()); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - // dx = dout * ddy - // dy = dout * ddx - // ddout = ddx * y + x * ddy - // change computation sequence to save memory, so ddout can inplace ddx and - // dx can be used as 'tmp' tensor - // (1) dx = x * ddy - // (2) dy = dout * ddx - // (3) ddout = ddx * y - // (4) ddout = ddout + dx - // (5) dx = dout * ddy - if (ddout) { - int axis = ctx.Attr("axis"); - auto& place = - *ctx.template device_context().eigen_device(); - // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace - if (ddout->numel() > ddx->numel()) { - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, dx, dy, MulGradDX(), - MulGradDY()); - - Tensor ddout_tmp; - ddout_tmp.mutable_data(ddout->dims(), ctx.GetPlace()); - - default_elementwise_mul(ctx, y, &ddx_safe, ddout); - default_elementwise_mul(ctx, &ddy_safe, x, - &ddout_tmp); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - } else { - // use dx to save 
memory, other than alloc tmp tensor - Tensor* ddout_tmp = dx; - - default_elementwise_mul(ctx, x, &ddy_safe, ddout_tmp); - // NOTE: in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, nullptr, dy, - MulGradDX(), MulGradDY()); - default_elementwise_mul(ctx, &ddx_safe, y, ddout); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(*ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - default_elementwise_mul(ctx, dout, &ddy_safe, dx); - } - } - } -}; - -template -class ElementwiseMulTripleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - // get input - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* d_dx = ctx.Input("D_DX"); - auto* d_dy = ctx.Input("D_DY"); - auto* d_ddout = ctx.Input("D_DDOut"); - - // get output - auto* out_d_x = ctx.Output("D_X"); - auto* out_d_y = ctx.Output("D_Y"); - auto* out_d_dout = ctx.Output("D_DOut"); - - auto* out_d_ddx = ctx.Output("D_DDX"); - auto* out_d_ddy = ctx.Output("D_DDY"); - - if (out_d_x) out_d_x->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_y) out_d_y->mutable_data(y->dims(), ctx.GetPlace()); - if (out_d_dout) out_d_dout->mutable_data(dout->dims(), ctx.GetPlace()); - if (out_d_ddx) out_d_ddx->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_ddy) out_d_ddy->mutable_data(y->dims(), ctx.GetPlace()); - - auto& place = *ctx.template device_context().eigen_device(); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, 
ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - if (d_ddout) { - if (out_d_x) { - // out_d_x = ddy * d_ddout - default_elementwise_mul(ctx, &ddy_safe, d_ddout, - out_d_x); - } - if (out_d_y) { - // out_d_y = ddx * d_ddout - default_elementwise_mul(ctx, &ddx_safe, d_ddout, - out_d_y); - } - } - - if (out_d_dout) { - // get out_d_dout - // out_d_dout = ddy * d_dx + d_dy * ddx - Tensor out_d_dout_tmp; - out_d_dout_tmp.mutable_data(dout->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, d_dy, &ddx_safe, - out_d_dout); - default_elementwise_mul(ctx, &ddy_safe, d_dx, - &out_d_dout_tmp); - auto out_d_dout_t = framework::EigenVector::Flatten(*out_d_dout); - auto out_d_dout_tmp_t = - framework::EigenVector::Flatten(out_d_dout_tmp); - out_d_dout_t.device(place) = out_d_dout_t + out_d_dout_tmp_t; - } - - if (out_d_ddx) { - // get out_d_ddx - // out_d_ddx = dout * d_dy + y * d_ddout - Tensor out_d_ddx_tmp; - out_d_ddx_tmp.mutable_data(ddx->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dy, out_d_ddx); - default_elementwise_mul(ctx, y, d_ddout, - &out_d_ddx_tmp); - auto out_d_ddx_t = framework::EigenVector::Flatten(*out_d_ddx); - auto out_d_ddx_tmp_t = framework::EigenVector::Flatten(out_d_ddx_tmp); - out_d_ddx_t.device(place) = out_d_ddx_t + out_d_ddx_tmp_t; - } - - if (out_d_ddy) { - // get out_d_ddy - // out_d_ddy = dout * d_dx + x * d_ddout - Tensor out_d_ddy_tmp; - out_d_ddy_tmp.mutable_data(ddy->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dx, out_d_ddy); - default_elementwise_mul(ctx, x, d_ddout, - &out_d_ddy_tmp); - auto out_d_ddy_t = framework::EigenVector::Flatten(*out_d_ddy); - auto out_d_ddy_tmp_t = framework::EigenVector::Flatten(out_d_ddy_tmp); - out_d_ddy_t.device(place) = out_d_ddy_t + out_d_ddy_tmp_t; - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h 
index 61862aa9f87408048c5d31a13c0be8a013046902..80b07721f0b4d1feb669bfce91127b0887d79391 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -45,6 +45,7 @@ limitations under the License. */ #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif @@ -145,17 +146,9 @@ void ElemwiseGradCompute(const framework::ExecutionContext &ctx, const framework::Tensor &dout, int axis, framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { - const framework::DDim &x_dim = x.dims(); - const framework::DDim &y_dim = y.dims(); const auto &dev_ctx = ctx.template device_context(); - if (x.dims() == y.dims()) { - phi::funcs::ElemwiseGradComputeNoBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } else { - phi::funcs::ElemwiseGradComputeWithBroadcast( - dev_ctx, x_dim, y_dim, x, y, out, dout, axis, dx, dy, dx_op, dy_op); - } + phi::funcs::ElemwiseGradCompute( + dev_ctx, x, y, out, dout, axis, dx, dy, dx_op, dy_op); } // It is a common implementation to compute binary calculation with the support @@ -1174,14 +1167,6 @@ static inline std::vector GetReduceDim(const framework::DDim &in, } #if defined(__NVCC__) || defined(__HIPCC__) -template -void ReduceWrapper(const platform::CUDADeviceContext &dev_ctx, int axis, - framework::Tensor *src, framework::Tensor *dst) { - std::vector reduce_dims = GetReduceDim(dst->dims(), src->dims(), axis); - TensorReduceImpl>( - dev_ctx, *src, dst, kps::IdentityFunctor(), reduce_dims, - dev_ctx.stream()); -} template void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, @@ -1189,36 +1174,8 @@ void GetGradXAndYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, 
framework::Tensor *dx, framework::Tensor *dy, Functor func) { - framework::Tensor tmp_dx; - framework::Tensor tmp_dy; - dx->mutable_data(place); - dy->mutable_data(place); - std::vector outs; - if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) { - outs = {dx, dy}; - } else if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, dy}; - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - outs = {dx, &tmp_dy}; - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - tmp_dy.mutable_data(dout->dims(), place); - tmp_dx.mutable_data(dout->dims(), place); - outs = {&tmp_dx, &tmp_dy}; - } - - paddle::operators::LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, axis, func); - - if (dx->dims() != dout->dims() && dy->dims() == dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - } else if (dx->dims() == dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } else if (dx->dims() != dout->dims() && dy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dx, dx); - ReduceWrapper(dev_ctx, axis, &tmp_dy, dy); - } + phi::GetGradXAndYOut(dev_ctx, place, axis, ins, *dout, dx, dy, + func); } template @@ -1227,22 +1184,8 @@ void GetGradXOrYOut(const platform::CUDADeviceContext &dev_ctx, std::vector ins, const framework::Tensor *dout, framework::Tensor *dxy, Functor func) { - framework::Tensor tmp_dxy; - dxy->mutable_data(place); - - std::vector outs; - if (dxy->dims() != dout->dims()) { - tmp_dxy.mutable_data(dout->dims(), place); - outs = {&tmp_dxy}; - } else { - outs = {dxy}; - } - - paddle::operators::LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, - axis, func); - if (dxy->dims() != dout->dims()) { - ReduceWrapper(dev_ctx, axis, &tmp_dxy, dxy); - } + phi::GetGradXOrYOut(dev_ctx, place, axis, ins, *dout, dxy, + func); } #endif diff --git 
a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index 1f8a95f0286bd3bb228bcda59e1198bf0763eb9a..3e9263fe93acd93638ff9e496203b7ea432cea86 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -33,7 +32,7 @@ namespace p = paddle::platform; USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, NPU); -USE_OP(elementwise_sub); +USE_OP_ITSELF(elementwise_sub); USE_OP_DEVICE_KERNEL(elementwise_sub, NPU); template diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc index 14b20baae1b0398a40ee74a3e16c2c992a4b557e..78855dd39572539e531bcd8ad3786ae95269ca8f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_xpu.cc @@ -14,7 +14,6 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" #include "xpu/refactor/math.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index b2cef95d1a349d66161db1c3edf7c14bc8a6d058..d15a7c272757fa683f835215e3db9ccec956af38 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" - #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" @@ -78,10 +76,16 @@ class ElementwiseSubDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle REGISTER_ELEMWISE_GRAD_MAKER(elementwise_sub, Sub); -REGISTER_ELEMWISE_EXPLICIT_OP_WITHOUT_GRAD(elementwise_sub, Sub); namespace ops = paddle::operators; +REGISTER_OPERATOR(elementwise_sub, ::paddle::operators::ElementwiseOp, + ::paddle::operators::ElementwiseSubOpMaker, + ::paddle::operators::ElementwiseOpInferVarType, + elementwise_subGradMaker<::paddle::framework::OpDesc>, + elementwise_subGradMaker<::paddle::imperative::OpBase>, + ::paddle::operators::ElementwiseOpInplaceInferer); + REGISTER_OPERATOR( elementwise_sub_grad, ops::ElementwiseOpGrad, ops::ElementwiseGradOpInplaceInferer, ops::ElementwiseGradNoBufVarsInferer, @@ -92,51 +96,6 @@ REGISTER_OPERATOR(elementwise_sub_grad_grad, ops::ElementwiseDoubleGradOpInplaceInferer, ops::ElementwiseDoubleGradNoBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - 
ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel>, - ops::ElementwiseSubKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel>, - ops::ElementwiseSubGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_sub_grad_grad, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel>, - ops::ElementwiseSubDoubleGradKernel>); - REGISTER_OP_VERSION(elementwise_sub) .AddCheckpoint( R"ROC(Register elementwise_sub for adding the attribute of Scale_y)ROC", diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu deleted file mode 100644 index 2c962af9877b978f7a6af25635f345c0ae5ffd27..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel>, - ops::ElementwiseSubKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel>, - ops::ElementwiseSubGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_sub_grad_grad, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel, - ops::ElementwiseSubDoubleGradKernel>, - ops::ElementwiseSubDoubleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h deleted file mode 100644 index 15c547b493ae045c13ab8d6b14a646cb92716a92..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ /dev/null @@ -1,96 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/platform/place.h" - -#include "paddle/phi/kernels/elementwise_grad_kernel.h" -#include "paddle/phi/kernels/math_kernel.h" -namespace paddle { -namespace operators { - -template -class ElementwiseSubKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.device_context(); - int axis = ctx.Attr("axis"); - phi::SubtractRawKernel( - static_cast::TYPE&>(dev_ctx), - *x, *y, axis, z); - } -}; - -template -class ElementwiseSubGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - int axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.device_context(); - - phi::SubtractGradKernel( - static_cast::TYPE&>(dev_ctx), - *x, *y, *dout, axis, dx, dy); - } -}; - -template -class ElementwiseSubDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* ddout = ctx.Output("DDOut"); - int axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.device_context(); - - paddle::optional ddx_optional = paddle::none; - paddle::optional ddy_optional = paddle::none; - if (ddx != nullptr) { - 
ddx_optional = *ddx; - } - if (ddy != nullptr) { - ddy_optional = *ddy; - } - phi::SubtractDoubleGradKernel( - static_cast::TYPE&>(dev_ctx), - *y, ddx_optional, ddy_optional, *dout, axis, ddout); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index b68d38d6df12a5d11f57b1556f8fc7ceec00d3e0..4169a938f2d0bff0cf8b23db35c943c9ff586212 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc index d12c6fc30cebaafd27c099ab708e0662477cb017..87c494b0e10bad64566b5248946c9b8b1b778f2f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_xpu.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_xpu.h" #include "xpu/refactor/math.h" diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 763fc5f2674104a718e33f5ef5ac7b2a1a7b23f5..ad8fd317013908e8908dff8bea3440e24779454e 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -32,6 +32,45 @@ using dnnl::stream; template class EltwiseMKLDNNKernel : public framework::OpKernel { + private: + dnnl::post_ops get_post_ops(const framework::ExecutionContext& ctx) const { + dnnl::post_ops post_operations; + if (ctx.HasAttr("activation_type")) { + const float scale = ctx.HasAttr("activation_scale") + ? ctx.Attr("activation_scale") + : 1.0f; + const float alpha = ctx.HasAttr("activation_alpha") + ? ctx.Attr("activation_alpha") + : 0.0f; + const float beta = ctx.HasAttr("activation_beta") + ? 
ctx.Attr("activation_beta") + : 0.0f; + + static std::unordered_map algo_map = { + {"relu", dnnl::algorithm::eltwise_relu}, + {"tanh", dnnl::algorithm::eltwise_tanh}, + {"leaky_relu", dnnl::algorithm::eltwise_relu}, + {"swish", dnnl::algorithm::eltwise_swish}, + {"hardswish", dnnl::algorithm::eltwise_hardswish}, + {"sqrt", dnnl::algorithm::eltwise_sqrt}, + {"abs", dnnl::algorithm::eltwise_abs}, + {"clip", dnnl::algorithm::eltwise_clip}, + {"gelu", dnnl::algorithm::eltwise_gelu_erf}, + {"gelu_tanh", dnnl::algorithm::eltwise_gelu_tanh}, + {"relu6", dnnl::algorithm::eltwise_bounded_relu}, + {"sigmoid", dnnl::algorithm::eltwise_logistic}}; + + const auto& activation_type = + algo_map.find(ctx.Attr("activation_type")); + + if (activation_type != algo_map.end()) { + post_operations.append_eltwise(scale, activation_type->second, alpha, + beta); + } + } + return post_operations; + } + public: void Compute(const framework::ExecutionContext& ctx) const override { const auto& dev_ctx = @@ -47,9 +86,9 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); - platform::BinaryMKLDNNHandler handler(BINARY_OP, axis, mkldnn_engine, - ctx.GetPlace(), x, y, z, scale_x, - scale_y, scale_o); + platform::BinaryMKLDNNHandler handler( + BINARY_OP, axis, mkldnn_engine, ctx.GetPlace(), x, y, z, scale_x, + scale_y, scale_o, get_post_ops(ctx)); const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc index 5222103256d614a2d6b1fa10662367ecb20d3cb2..ea009a38056f078689bd6dc4c9a41d2b34e8c1fa 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc @@ -17,8 +17,13 @@ #include 
"paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add_double_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_double_grad, GPU, ALL_LAYOUT); +#endif namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index 9d4d11609ac2047aa8934cb2868f79359a816e12..ce5c6b701d95894db8e3a84215f537352914706a 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -21,9 +21,12 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc index 9aa206efed8c0111f56b6651e0228acc316b1bfe..3cecc52a3c481cf9cb4a1e2eba6ded704a8fa8ee 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -27,8 +27,14 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" -USE_OP(elementwise_div); +USE_OP_ITSELF(elementwise_div); + +PD_DECLARE_KERNEL(divide_double_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(divide_double_grad, GPU, ALL_LAYOUT); +#endif 
namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/empty_op.cc b/paddle/fluid/operators/empty_op.cc index e23342ebb5dc7639d68500964bfdfbd099d077cd..9e0e4e7fe1c6d26df7c4347d8bc81a985e6c973b 100644 --- a/paddle/fluid/operators/empty_op.cc +++ b/paddle/fluid/operators/empty_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/empty_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/nullary.h" + namespace paddle { namespace operators { @@ -51,46 +53,6 @@ class EmptyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* context) const override { - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "empty"); - - if (context->HasInput("ShapeTensor")) { - auto shape_dims = context->GetInputDim("ShapeTensor"); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - context->SetOutputDim("Out", phi::make_ddim(vec_dims)); - } else if (context->HasInputs("ShapeTensorList")) { - std::vector out_dims; - auto dims_list = context->GetInputsDim("ShapeTensorList"); - for (size_t i = 0; i < dims_list.size(); ++i) { - auto& dims = dims_list[i]; - PADDLE_ENFORCE_EQ(dims, phi::make_ddim({1}), - platform::errors::InvalidArgument( - "The shape of Tensor in list must be [1]. 
" - "But received the shape is [%s]", - dims)); - - out_dims.push_back(-1); - } - - context->SetOutputDim("Out", phi::make_ddim(out_dims)); - } else { - auto& shape = context->Attrs().Get>("shape"); - for (size_t i = 0; i < shape.size(); ++i) { - PADDLE_ENFORCE_GE( - shape[i], 0, - platform::errors::InvalidArgument( - "Each value of attribute 'shape' is expected to be no less " - "than 0. But recieved: shape[%u] = %d; shape = [%s].", - i, shape[i], phi::make_ddim(shape))); - } - context->SetOutputDim("Out", phi::make_ddim(shape)); - } - } - protected: framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const framework::Tensor& tensor, @@ -126,14 +88,8 @@ class EmptyOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OPERATOR( - empty, ops::EmptyOp, ops::EmptyOpMaker, ops::EmptyOpVarTypeInference, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(empty, ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel); +DECLARE_INFER_SHAPE_FUNCTOR(empty, EmptyInferShapeFunctor, + PD_INFER_META(phi::CreateInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(empty, ops::EmptyOp, ops::EmptyOpMaker, + ops::EmptyOpVarTypeInference, + EmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/empty_op.cu.cc b/paddle/fluid/operators/empty_op.cu.cc deleted file mode 100644 index 22799e507aeff7940274f729b174f50bfd9132a5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/empty_op.cu.cc +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/empty_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - empty, ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel, - ops::EmptyKernel); diff --git a/paddle/fluid/operators/empty_op.h b/paddle/fluid/operators/empty_op.h deleted file mode 100644 index cb466fffcd7c7358b6e84c18b7895a17b2eaa907..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/empty_op.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/utils.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class EmptyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor *out_tensor = context.Output("Out"); - - auto shape = GetShape(context); - out_tensor->Resize(shape); - - out_tensor->mutable_data(context.GetPlace(), - framework::TransToPhiDataType(dtype)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/erf_op.cc b/paddle/fluid/operators/erf_op.cc index f68f670394871114369f8b05b7f958c03d5508d0..64274d098c0585c28196743c09d5e6c78c3fe37d 100644 --- a/paddle/fluid/operators/erf_op.cc +++ b/paddle/fluid/operators/erf_op.cc @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/erf_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,18 +31,6 @@ class ErfOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(%s) of ErfOp should not be null.", "X")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(%s) of ErfOp should not be null.", "Out")); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -116,28 +106,10 @@ class ErfGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(erf, ErfInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(erf, ops::ErfOp, ops::ErfOpMaker, ops::ErfGradOpMaker, - ops::ErfGradOpMaker); + ops::ErfGradOpMaker, + ErfInferShapeFunctor); REGISTER_OPERATOR(erf_grad, ops::ErfGradOp); -REGISTER_OP_CPU_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CPU_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); - -REGISTER_OP_CUDA_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CUDA_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); diff --git a/paddle/fluid/operators/erf_op.h b/paddle/fluid/operators/erf_op.h deleted file mode 100644 index 
4780b2e7f5b28d4a743f6d35046891b30cbefd00..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/erf_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES -#endif -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -template -class ErfKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - out->mutable_data(in->place()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = - *context.template device_context().eigen_device(); - EigenErf, T>::Eval(place, eigen_out, - eigen_in); - } -}; - -template -class ErfGradKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - - dx->mutable_data(dout->place()); - - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto 
eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = - *context.template device_context().eigen_device(); - EigenErfGrad, T>::Eval(place, eigen_dx, - eigen_x, eigen_dout); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/erfinv_op.cc b/paddle/fluid/operators/erfinv_op.cc index 3d409b4c4f6772bc7b234208e78c5088eeb2fc00..374b00792622f91edc0b66cebb278cc79f30dc66 100644 --- a/paddle/fluid/operators/erfinv_op.cc +++ b/paddle/fluid/operators/erfinv_op.cc @@ -73,8 +73,8 @@ DECLARE_INPLACE_OP_INFERER(ErfinvInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(erfinv, ErfinvInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR( erfinv, paddle::operators::ErfinvOp, paddle::operators::ErfinvOpMaker, diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 119e514a49e28fb3295e36947664770889bbdd81..97a35a34f23e96707269482e29da13a15538cdca 100755 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -121,37 +121,9 @@ REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker, ops::ExpandAsV2GradOpMaker); REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp, ops::ExpandAsV2GradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel); -REGISTER_OP_CPU_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - 
ops::ExpandAsV2Kernel); -REGISTER_OP_CUDA_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); -#endif REGISTER_OP_VERSION(expand_as_v2) .AddCheckpoint( R"ROC(fix expand_as_v2 and add new input [Y])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( - "Y", "Expand X according to the shape of Y")); \ No newline at end of file + "Y", "Expand X according to the shape of Y")); diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h index d7560efc5c1f1244ae4eed4c68c59a38287057ee..f09e7764eed3959c7f0ca700b953dbd0c2891d12 100755 --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -32,219 +32,5 @@ template using EigenTensor = framework::EigenTensor; -template -class ExpandAsV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - auto target_shape = context.Attr>("target_shape"); - auto target_rank = target_shape.size(); - PADDLE_ENFORCE_GE(target_rank, rank, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be greater than or equal to " - "the rank (%d) of the input 'x'.", - target_rank, rank)); - PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); - PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be less than or equal to %d.", - target_rank, MAX_RANK_SUPPORTED)); - - switch (target_rank) { - case 1: - ExpandAs<1>(context); - break; - case 2: - ExpandAs<2>(context); - break; - case 3: - ExpandAs<3>(context); - break; - case 4: - ExpandAs<4>(context); - break; - case 5: - ExpandAs<5>(context); - 
break; - case 6: - ExpandAs<6>(context); - break; - } - } - - protected: - template - void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto target_shape = context.Attr>("target_shape"); - auto vec_in_dims = phi::vectorize(in_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(target_shape[i], 0, - platform::errors::InvalidArgument( - "The value of target shape cannot be zero.")); - if (i < diff) { - PADDLE_ENFORCE_GT( - target_shape[i], 0, - platform::errors::InvalidArgument( - "The expanded size (%d) for non-existing dimensions must be " - "positive for expand_as_v2 op.", - target_shape[i])); - repeat_times[i] = target_shape[i]; - } else if (target_shape[i] > 0) { - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], target_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in shape for expand_as_v2 op.", - vec_in_dims[i], target_shape[i])); - repeat_times[i] = 1; - } else { - repeat_times[i] = target_shape[i]; - } - } else { - PADDLE_ENFORCE_EQ( - target_shape[i], -1, - platform::errors::InvalidArgument( - "When the value in shape is negative for expand_as_v2 op, " - "only -1 is supported, but the value received is %d.", - target_shape[i])); - repeat_times[i] = 1; - } - } - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims = phi::make_ddim(target_shape); - - out0->Resize(out_dims); - auto x = EigenTensor::From(*in0, new_in_dims); - out0->mutable_data(context.GetPlace()); - auto y = 
EigenTensor::From(*out0, out_dims); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, Rank>::Eval(place, y, x, - bcast_dims); - } -}; - -template -class ExpandAsV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto target_shape = context.Attr>("target_shape"); - auto x_dims = in0->dims(); - auto vec_in_dims = phi::vectorize(x_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - repeat_times[i] = target_shape[i] / vec_in_dims[i]; - } - std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < repeat_times.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(repeat_times[i]); - reshape_dims_vec.push_back(vec_in_dims[i]); - } - - int dims = reduce_dims_vec.size(); - bool just_copy = true; - for (size_t i = 0; i < repeat_times.size(); i++) { - if (repeat_times[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - framework::TensorCopy(*in0, context.GetPlace(), context.device_context(), - out0); - } else { - PADDLE_ENFORCE_GE(dims, 1, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_v2_grad op must be greater than or " - "equal to 1, but the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_v2_grad op must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, dims)); - switch (dims) 
{ - case 1: - ExpandAsBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - ExpandAsBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - ExpandAsBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - ExpandAsBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - ExpandAsBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - ExpandAsBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void ExpandAsBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index cdd4e1dbaae6a6a74bb11be44589877234021764..df00ae54c1036b1b0f0899eb0a949d58c398aa48 100644 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -24,7 +24,6 
@@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/exponential_op.cc b/paddle/fluid/operators/exponential_op.cc index ee456dcdafbc51d547e7beacc4e4e79f98738b88..1a48a6767852e138e7725a68ca4ffc56de8234be 100644 --- a/paddle/fluid/operators/exponential_op.cc +++ b/paddle/fluid/operators/exponential_op.cc @@ -76,7 +76,7 @@ class ExponentialKernel auto engine = gen->GetCPUEngine(); std::uniform_real_distribution uniform(0.0, 1.0); - distribution::exponential_transform trans(lambda); + phi::funcs::exponential_transform trans(lambda); for (int64_t i = 0; i < size; ++i) { out_data[i] = trans(uniform(*engine)); } diff --git a/paddle/fluid/operators/exponential_op.cu b/paddle/fluid/operators/exponential_op.cu index 8b989501e4f4248b0c2e3b23e1e75a4865b08588..d5abbf9a26afe6bcbbd8549f59d632fc4e53fec2 100644 --- a/paddle/fluid/operators/exponential_op.cu +++ b/paddle/fluid/operators/exponential_op.cu @@ -26,9 +26,9 @@ class ExponentialKernel auto& dev_cxt = ctx.template device_context(); T lambda = static_cast(ctx.Attr("lambda")); - distribution::uniform_distribution dist; - distribution::exponential_transform trans(lambda); - distribution::distribution_and_transform(dev_cxt, out, dist, trans); + phi::funcs::uniform_distribution dist; + phi::funcs::exponential_transform trans(lambda); + phi::funcs::distribution_and_transform(dev_cxt, out, dist, trans); } }; diff --git a/paddle/fluid/operators/exponential_op.h b/paddle/fluid/operators/exponential_op.h index fbcabc594db0814da1ec50934a0f02514dc208be..7ded174a9f47ede48a49b19b25539867ce344fb0 100644 --- a/paddle/fluid/operators/exponential_op.h +++ b/paddle/fluid/operators/exponential_op.h @@ -17,7 +17,7 @@ #include 
"paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distribution_helper.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc index 8f8a0f174a79f13f0bee7aa7b425f8c645e15687..537c218d357b67980216ab3053707b8adb867c01 100644 --- a/paddle/fluid/operators/eye_op.cc +++ b/paddle/fluid/operators/eye_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -21,24 +24,6 @@ class EyeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of EyeOP should not be null.")); - auto num_rows = ctx->Attrs().Get("num_rows"); - PADDLE_ENFORCE_EQ( - num_rows >= 0, true, - platform::errors::InvalidArgument( - "The value of Input(num_rows) should be non-negative int.")); - auto num_columns = ctx->Attrs().Get("num_columns"); - if (num_columns == -1) num_columns = num_rows; - PADDLE_ENFORCE_EQ( - num_columns >= 0, true, - platform::errors::InvalidArgument( - "The value of Input(num_columns) should be non-negative int.")); - ctx->SetOutputDim("Out", {num_rows, num_columns}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -82,8 +67,11 @@ Return an identity 
tensor whose shape is [num_rows, num_columns]. } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(eye, EyeInferShapeFunctor, + PD_INFER_META(phi::EyeInferMeta)); REGISTER_OPERATOR( eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + EyeInferShapeFunctor); diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 0eb84f18f25f03b1fd0310c5815ee342ff835a6f..27a235765227f15dd412dcd6ad55f2a24471c6da 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/attn_feed_forward.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace framework = paddle::framework; @@ -29,6 +30,11 @@ namespace platform = paddle::platform; USE_OP(matmul); USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +#endif + // get paddle matmul op results as baseline template void GetLinearOp(const std::vector &x, const std::vector &y, diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index 79018f2a97448a8c6265a969dad37bce77d1b7ee..cb03add3143278260d41c3893e7adad976908d4e 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel { tensor_value.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_value, value); NpuOpRunner runner; -#if (CANN_VERSION_CODE >= 503003) +#if 
(CANN_VERSION_CODE >= 503003 && CANN_VERSION_CODE < 504001) runner.SetType("FillD") .AddInput(tensor_value) .AddOutput(*out_var) diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7870efba4e7a1a285bbd4b28b04c2b15f263c347 --- /dev/null +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -0,0 +1,597 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000 + +#if defined(PADDLE_WITH_CUDA) +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/operators/filter_by_instag_op.h" + +#if defined(PADDLE_WITH_CUDA) +namespace cg = cooperative_groups; +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using SelectedRows = phi::SelectedRows; +using LoDTensor = framework::LoDTensor; + +template +using Vector = framework::Vector; + +#define WARP_SIZE 32 +#define MAX_WARP_NUM 32 + +#if defined(PADDLE_WITH_CUDA) + +template +__global__ void filter_copy_fuse_kernel( + const size_t N, const int ins_per_thread, size_t* x1_lods_data, + size_t* x2_lods_data, const int64_t* x2_data, const int64_t* x3_data, + int64_t filter_tag_size, T* out_data, int64_t* map_data, + size_t* map_lods_data, size_t* out_lods_data, size_t* out_idx_data, + const T* x1_data, int x1_embed_size, float* loss_weight_data, + float fill_value) { + // N is instance num + // one threads for ins_per_thread instances + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + cg::thread_block b = cg::this_thread_block(); + cg::thread_block_tile g = cg::tiled_partition(b); + + int gid = idx / WARP_SIZE; + + // general use + int thread_num = + (N + (ins_per_thread - 1)) / ins_per_thread; // real thread num + int total_warp_num = thread_num / WARP_SIZE; // 30 + int remain_thread_num = thread_num % WARP_SIZE; // 16 + + int warp_thread_num = -1; + if (gid < total_warp_num) { + warp_thread_num = WARP_SIZE; + } else { + warp_thread_num = remain_thread_num; + } + + int group_num = total_warp_num; + if 
(remain_thread_num > 0) { + group_num = total_warp_num + 1; + } + + if (gid >= group_num) return; + + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + + if (N < ins_end) ins_end = N; + + int flag_data[5]; + int prefix_sum_data[5]; + int prefix_sum_data2[5]; + + __shared__ int shr[MAX_WARP_NUM]; + __shared__ int shr2[MAX_WARP_NUM]; + __shared__ int shr3[MAX_WARP_NUM]; + + for (int p = ins_start; p < ins_end; p++) { + int ins_tag_start = x2_lods_data[p]; + int ins_tag_end = x2_lods_data[p + 1]; + flag_data[p - ins_start] = 0; + // filter logic + int i = ins_tag_start; + for (; i < ins_tag_end; i++) { + int64_t ins_tag = x2_data[i]; + int j = 0; + for (; j < filter_tag_size; j++) { + if (x3_data[j] == ins_tag) break; + } + // if ins_tag in filter tag + if (j < filter_tag_size) { + flag_data[p - ins_start] = 1; + break; + } + } + } + + int sum_addr = 0; + int sum_flag = 0; + int sum_out_lods = 0; + + int local_addr = 0; + int local_flag = 0; + int local_out_lods = 0; + + if (ins_start < ins_end) { + for (int p = ins_start; p < ins_end; p++) { + int previous = -1; + if (p == ins_start) { + previous = 0; + } else { + previous = prefix_sum_data[p - ins_start - 1]; + } + + prefix_sum_data[p - ins_start] = + previous + + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + local_addr = prefix_sum_data[ins_end - 1 - ins_start]; + sum_addr = local_addr; + + for (int p = ins_start; p < ins_end; p++) { + local_flag += flag_data[p - ins_start]; + } + sum_flag = local_flag; + + for (int p = ins_start; p < ins_end; p++) { + local_out_lods += + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + sum_out_lods = local_out_lods; + } + + for (int i = 1; i < warp_thread_num; i *= 2) { + int temp_addr = g.shfl_up(sum_addr, i); + int temp_flag = g.shfl_up(sum_flag, i); + int temp_out_lods = g.shfl_up(sum_out_lods, i); + + if (g.thread_rank() >= i) { + sum_addr += temp_addr; + sum_flag += temp_flag; + 
sum_out_lods += temp_out_lods; + } + } + + if (g.thread_rank() == warp_thread_num - 1) { + shr[gid] = sum_addr; + shr2[gid] = sum_flag; + shr3[gid] = sum_out_lods; + } + + b.sync(); + + int sum_addr2 = 0; + int sum_flag2 = 0; + int sum_out_lods2 = 0; + + // communicate between warp + if (g.thread_rank() < group_num) { + sum_addr2 = shr[g.thread_rank()]; + sum_flag2 = shr2[g.thread_rank()]; + sum_out_lods2 = shr3[g.thread_rank()]; + } + + for (int i = 1; i < group_num; i *= 2) { + int temp_addr2 = g.shfl_up(sum_addr2, i); + int temp_flag2 = g.shfl_up(sum_flag2, i); + int temp_out_lods2 = g.shfl_up(sum_out_lods2, i); + + if (g.thread_rank() >= i) { + sum_addr2 += temp_addr2; + sum_flag2 += temp_flag2; + sum_out_lods2 += temp_out_lods2; + } + } + + int sum_addr3 = g.shfl(sum_addr2, gid); + int sum_flag3 = g.shfl(sum_flag2, gid); + int sum_out_lods3 = g.shfl(sum_out_lods2, gid); + + int p_flag; + int p_addr; + int p_out_lods; + + if (ins_start < ins_end) { + p_addr = sum_addr3 - shr[gid] + sum_addr - local_addr; + p_flag = sum_flag3 - shr2[gid] + sum_flag - local_flag; + p_out_lods = sum_out_lods3 - shr3[gid] + sum_out_lods - local_out_lods; + + for (int p = ins_start; p < ins_end; p++) { + if (ins_start == p) { + prefix_sum_data2[p - ins_start] = p_addr; + } else { + prefix_sum_data2[p - ins_start] = + prefix_sum_data2[p - ins_start - 1] + + flag_data[p - ins_start - 1] * + (x1_lods_data[p] - x1_lods_data[p - 1]); + } + } + + if (gid == 0 && g.thread_rank() == group_num - 1) { + *out_idx_data = (sum_flag2 + 1); + map_lods_data[sum_flag2] = sum_flag2; + } + } + + int sum_out_lods4 = g.shfl(sum_out_lods2 + 1, group_num - 1); + + if (ins_start < ins_end) { + int out_lods_idx = p_flag + 1; + for (int p = ins_start; p < ins_end; p++) { + if (flag_data[p - ins_start] == 1) { + size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; + int t = out_lods_idx - 1; + int previous; + if (out_lods_idx == p_flag + 1) { + previous = p_out_lods; + } else { + previous = 
out_lods_data[t]; + } + map_data[t * 3] = (int64_t)previous; + map_data[t * 3 + 1] = x1_lods_data[p]; + map_lods_data[t] = t; + out_lods_data[out_lods_idx] = previous + batch_len; + map_data[t * 3 + 2] = batch_len; + out_lods_idx++; + } + } + + // fill loss_weight_data + if (sum_out_lods4 > 1) { + int out_data_num = sum_out_lods4 - 1; + int out_start = ins_start; + if (out_start < out_data_num) { + int out_end = ins_end >= out_data_num ? out_data_num : ins_end; + for (int p = out_start; p < out_end; p++) { + loss_weight_data[p] = fill_value; + } + } + } + + for (int p = ins_start; p < ins_end; p++) { + // copy logic + if (flag_data[p - ins_start] == 1) { + auto output_start_idx = prefix_sum_data2[p - ins_start]; + T* dst = out_data + output_start_idx * x1_embed_size; + const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; + const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } + } + } + + b.sync(); +} + +template +__global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, + const T* out_grad_data, T* x1_grad_data, + const int64_t* map_data, int x1_embed_size) { + // N is instance num + // one threads for one instance + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + if (ins_start >= N) { + return; + } + if (ins_end > N) ins_end = N; + for (int p = ins_start; p < ins_end; p++) { + T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; + const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; + const T* src_end = + out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size; + + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } +} + +#endif + +template +class FilterByInstagGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if 
defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + + gpuStream_t current_stream = context.cuda_device_context().stream(); + + int max_thread_num_per_block = 1024; + // context.cuda_device_context().GetMaxThreadsPerBlock(); + // X1 is global FC output + // Dim [batch size, embedding size] + const LoDTensor* x1 = context.Input("Ins"); + bool is_lod = context.Attr("is_lod"); + + int is_x1_lod = -1; + if (is_lod) + is_x1_lod = 1; + else + is_x1_lod = 0; + + int64_t out_val_if_empty = context.Attr("out_val_if_empty"); + size_t x1_embed_size = x1->dims()[1]; + // X2 is ins tag list + // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... ]] + const LoDTensor* x2 = context.Input("Ins_tag"); + // expected auto = const int64_t + const int64_t* x2_data = x2->data(); + + // X3 is local fc tag list + // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] + const Tensor* x3 = context.Input("Filter_tag"); + const int64_t* x3_data = x3->data(); + + Vector x2_lods; + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_per_num = x2->dims()[1]; + // x2_lods.resize(x2->dims()[0] + 1); + // move to cuda + x2_lods.push_back(0); + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(x2_lods.back() + instag_per_num); + } + } + + const size_t x2_lods_size = x2_lods.size() - 1; + paddle::framework::MixVector mixv_x2_lods(&x2_lods); + + size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); + + Vector x1_lods; + if (!is_x1_lod) { + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } else { + // x1_lods = context.Input("Ins")->lod()[0]; + // new: lod_level=0 => lod() return {} + if (x1->lod().size() != 0) { // lod_level = 1 + x1_lods = x1->lod()[0]; + } else { // lod_level = 0 + // x1_lods.resize(x1->dims()[0] + 1); + // move to cuda + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + 
x1_lods.push_back(i + 1); + } + } + } + + paddle::framework::MixVector mixv_x1_lods(&x1_lods); + + size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place); + auto* x1_data = x1->data(); + + // set output value + // for those whose ins been dropout, set 0 for whole lines. + // otherwise, copy whole line + // Dim [local fc count, batch size, embedding size] + LoDTensor* out = context.Output("Out"); + LoDTensor* map = context.Output("IndexMap"); + LoDTensor* loss_weight = context.Output("LossWeight"); + + int out_first = x1_lods.back(); + + out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); + loss_weight->Resize(phi::make_ddim({(int64_t)x2_lods_size, 1})); + + T* out_data = out->mutable_data(gpu_place); + int64_t* map_data = map->mutable_data(gpu_place); + float* loss_weight_data = loss_weight->mutable_data(gpu_place); + + int block_size = max_thread_num_per_block; + int ins_per_thread = (x2_lods_size + block_size - 1) / block_size; + dim3 block_dim(block_size); + dim3 grid_dim(1); + + Vector out_lods(x2_lods_size + 1, 0); + Vector map_lods(x2_lods_size + 1, 0); + + paddle::framework::MixVector mixv_out_lods(&out_lods); + paddle::framework::MixVector mixv_map_lods(&map_lods); + + // thrust::device_vector out_idx(1); + Vector out_idx(1, 0); + paddle::framework::MixVector mixv_out_idx(&out_idx); + + size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place); + size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place); + size_t* map_lods_data = mixv_map_lods.CUDAMutableData(gpu_place); + + float fill_value = 1.0; + + filter_copy_fuse_kernel<<>>( + x2_lods_size, ins_per_thread, x1_lods_data, x2_lods_data, x2_data, + x3_data, x3->numel(), out_data, map_data, map_lods_data, out_lods_data, + out_idx_data, x1_data, x1_embed_size, loss_weight_data, fill_value); + + platform::GpuStreamSync(current_stream); + + mixv_out_lods.resize(mixv_out_idx[0]); + + if (mixv_out_lods.size() - 
1 > 0) { + out->Resize(phi::make_ddim( + {(int64_t)mixv_out_lods.back(), (int64_t)x1_embed_size})); + + map->Resize(phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 3})); + loss_weight->Resize( + phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 1})); + + } else { + out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({1, 3})); + loss_weight->Resize(phi::make_ddim({1, 1})); + } + + if (mixv_out_lods.size() - 1 > 0) { + map_lods.resize(mixv_out_lods.size()); + + mixv_map_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + + map->set_lod(map_lod_info); + loss_weight->set_lod(map_lod_info); + + mixv_out_lods.CopyToCPU(); + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + } else { + Vector map_lods(2, 0); + paddle::framework::MixVector mixv_map_lods(&map_lods); + thrust::device_ptr map_data_ptr(map_data); + + map_data_ptr[0] = 0; + map_data_ptr[1] = 1; + map_data_ptr[2] = 1; + + mixv_map_lods[0] = 0; + mixv_map_lods[1] = 1; + mixv_out_lods.push_back(1); + + mixv_map_lods.CopyToCPU(); + mixv_out_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + map->set_lod(map_lod_info); + + loss_weight->set_lod(map_lod_info); + + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + thrust::device_ptr out_data_ptr(out_data); + + // gpu kernel + if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } + + thrust::device_ptr 
loss_weight_data_ptr(loss_weight_data); + loss_weight_data_ptr[0] = 0; + } + +#endif + } +}; + +template +class FilterByInstagGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + gpuStream_t current_stream = context.cuda_device_context().stream(); + auto max_thread_num_per_block = 1024; + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* x1_grad = context.Output(framework::GradVarName("Ins")); + auto* loss_weight = context.Input("LossWeight"); + auto* mmap = context.Input("IndexMap"); + auto* x1 = context.Input("Ins"); + + x1_grad->set_lod(context.Input("Ins")->lod()); + x1_grad->Resize(x1->dims()); + + auto* mmap_data = mmap->data(); + // expected auto = T + auto* output_grad_data = output_grad->data(); + auto* loss_weight_data = loss_weight->data(); + + // expected auto = T + auto* x1_grad_data = x1_grad->mutable_data(gpu_place); + thrust::device_ptr x1_grad_data_ptr(x1_grad_data); + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + + thrust::fill(x1_grad_data_ptr, + x1_grad_data_ptr + x1->dims()[0] * x1->dims()[1], 0); + + if (loss_weight->numel() != 1 || loss_weight_data_ptr[0] != 0) { + auto output_dims = output_grad->dims(); + int x1_embed_size = output_dims[1]; + + // one thread for multi-instances + int block_size = max_thread_num_per_block; + + size_t N = mmap->dims()[0]; + dim3 block_dim(block_size); + + dim3 grid_dim((N + block_size - 1) / block_size); + + const int ins_per_thread = 1; + + copy_grad_kernel<<>>( + N, ins_per_thread, output_grad_data, x1_grad_data, mmap_data, + x1_embed_size); + + cudaStreamSynchronize(current_stream); + } + +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL(filter_by_instag, ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + 
ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel); + +REGISTER_OP_CUDA_KERNEL(filter_by_instag_grad, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel); diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index deb2aa96b539e360cf2edad97b21cb6e9ddba066..3abc980ceaafc3719c13cad51c346282be2c694f 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -61,7 +61,20 @@ class FilterByInstagKernel : public framework::OpKernel { // expected auto = const int64_t auto* x2_data = x2->data(); // e.g get [0, 1, 2, 3, ...] - size_t x2_lods_size = x2->dims()[0]; + // size_t x2_lods_size = x2->dims()[0]; + // size_t instag_num_per_ins = x2->dims()[1]; + + Vector x2_lods(1, 0); + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_num_per_ins = x2->dims()[1]; + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(x2_lods.back() + instag_num_per_ins); + } + } + Vector x1_lods(1, 0); if (!is_x1_lod) { for (int i = 0; i < x1->dims()[0]; i++) { @@ -79,8 +92,8 @@ class FilterByInstagKernel : public framework::OpKernel { } std::unordered_map mmap_aux; Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods_size; i++) { - for (size_t j = i; j < i + 1; j++) { + for (size_t i = 0; i < x2_lods.size() - 1; i++) { + for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { if (filter_tag.find(x2_data[j]) != filter_tag.end()) { size_t batch_len = x1_lods[i + 1] - x1_lods[i]; mmap_aux[out_lods.back()] = x1_lods[i]; @@ -165,8 +178,10 @@ class FilterByInstagKernel : public framework::OpKernel { out_data[oi] = (int32_t)out_val_if_empty; } else if (std::is_same::value) { out_data[oi] = (int64_t)out_val_if_empty; - } else { + } else if (std::is_same::value) { 
out_data[oi] = static_cast(out_val_if_empty); + } else { + out_data[oi] = static_cast(out_val_if_empty); } } loss_weight_data[0] = 0; diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 5ef13b38c8a86e16cefdc97be6934b313fdb7bc4..feae954e355b85f5a18f8a48919770fd46a73f70 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/phi_utils.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/flatten_grad_kernel.h" diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 40ec9aef190ff4bacd52b19a1c0b12300a35b61e..92f59e118c3b7bb66a2c5c76d66109ddf04ee076 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -95,6 +95,17 @@ class FoldOp : public framework::OperatorWithKernel { "but recieved strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations + PADDLE_ENFORCE_GT(output_height, 1, + platform::errors::InvalidArgument( + "The `output_height` should be greater than one, " + "but recieved output_height: %d .", + output_height)); + PADDLE_ENFORCE_GT(output_width, 1, + platform::errors::InvalidArgument( + "The `output_width` should be greater than one, " + "but recieved output_width: %d .", + output_width)); + // check output size PADDLE_ENFORCE_GT( dilation_height, 0, platform::errors::InvalidArgument( @@ -146,7 +157,7 @@ class FoldOp : public framework::OperatorWithKernel { output_width)); PADDLE_ENFORCE_EQ( - blocks_height * blocks_width, in_dims[1], + blocks_height * blocks_width, in_dims[2], platform::errors::InvalidArgument( "Given input output_size (%d, %d), " "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " @@ -156,6 +167,15 
@@ class FoldOp : public framework::OperatorWithKernel { strides[0], strides[1], dilations[0], dilations[1], blocks_height, blocks_width, blocks_height * blocks_width, in_dims[2])); + PADDLE_ENFORCE_EQ( + in_dims[1] % (kernel_sizes[0] * kernel_sizes[1]), 0, + platform::errors::InvalidArgument( + "Expected size of input's dimension 1 to be divisible by the" + "product of kernel_size, but got input.size(1)=%d and " + "kernel_size=( %d" + ", %d).", + in_dims[1], kernel_sizes[0], kernel_sizes[1])); + out_dims.push_back(output_height); out_dims.push_back(output_width); ctx->SetOutputDim("Y", phi::make_ddim(out_dims)); diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 67287afa6ae5059f8af3dcdbd6910ca35db7c3c0..80e7f5c001d4b8139b538570c42fcd8d2604961b 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -19,7 +19,8 @@ register_operators(EXCLUDES fused_attention_op fused_transformer_op fused_feedforward_op - resnet_unit_op) + resnet_unit_op + fused_gemm_epilogue_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -79,4 +80,8 @@ if (WITH_GPU OR WITH_ROCM) cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory) cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory) endif() + + if (CUDA_VERSION GREATER_EQUAL 11.6) + op_library(fused_gemm_epilogue_op) + endif() endif() diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 20801d2243fb395b250f8416f1e2f5ba6a1423a4..3a2de0c4a093514a1c40321ab7dad61011709204 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -89,9 +89,9 @@ __global__ void 
BroadcastKernelBinary( template void LaunchBiasAddFwKernel(const platform::CUDADeviceContext& ctx, int m, int n, const T* in0, const T* in1, T* out) { - int in_vec_size = std::min(platform::GetVectorizedSize(in0), - platform::GetVectorizedSize(in1)); - int out_vec_size = std::min(4, platform::GetVectorizedSize(out)); + int in_vec_size = + std::min(phi::GetVectorizedSize(in0), phi::GetVectorizedSize(in1)); + int out_vec_size = std::min(4, phi::GetVectorizedSize(out)); int vec_size = std::min(out_vec_size, in_vec_size); int numel = m * n; @@ -191,9 +191,9 @@ void SetConfigForColumnReduce(const int max_threads, const int reduce_num, int num_block = (max_threads / left_num); if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { - *blocking_size = phi::kernels::details::GetLastPow2(reduce_num / num_block); + *blocking_size = phi::funcs::details::GetLastPow2(reduce_num / num_block); if (*blocking_size <= 1) { - *blocking_size = phi::kernels::details::GetLastPow2(sqrt(reduce_num)); + *blocking_size = phi::funcs::details::GetLastPow2(sqrt(reduce_num)); } else if (*blocking_size * 2 < reduce_num) { *blocking_size *= 2; } diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index bb5b363fe83995faf69f61b0a1a1693ff758fa37..5dbf4fb88b2a78838ce0fe95be653f68f4805416 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -17,8 +17,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/kernels/funcs/padding.h" DECLARE_int64(cudnn_exhaustive_search_times); @@ -86,7 +86,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { in_data_dims, strides, ksize); int data_dim = strides.size(); // 2d or 3d - bool is_sys_pad = math::IsSymmetricPadding(paddings, data_dim); + bool is_sys_pad = phi::funcs::IsSymmetricPadding(paddings, data_dim); Tensor transformed_input; std::vector padding_common(data_dim, 0); @@ -118,13 +118,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { T pad_value(0.0); switch (rank) { case 4: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; case 5: { - math::PadFunction( - ctx, input_pad, transformed_input_channel, pad_value, + phi::funcs::PadFunction( + dev_ctx, input_pad, transformed_input_channel, pad_value, &transformed_input); } break; default: diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 6119af18ce153ac2bcd5d45a69ab7b5d86a3cc10..b3ac3606eaf8ee843a2be98b7a237037afaf524f 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -32,7 +32,7 @@ namespace platform = paddle::platform; namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; -USE_OP(batch_norm); +USE_OP_ITSELF(batch_norm); USE_CUDA_ONLY_OP(fused_bn_add_activation); USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 
1864bdbb86667290474d297cc481f5d6352c8022..a80f590aa495db8090a30118ed4128843c0f8860 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace framework = paddle::framework; @@ -29,10 +30,10 @@ namespace platform = paddle::platform; namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; -USE_OP(conv2d); -USE_OP(conv2d_grad); -USE_OP_DEVICE_KERNEL(conv2d, CUDNN); -USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN); +USE_OP_ITSELF(conv2d); +USE_OP_ITSELF(conv2d_grad); +PD_DECLARE_KERNEL(conv2d, GPUDNN, ALL_LAYOUT); +PD_DECLARE_KERNEL(conv2d_grad, GPUDNN, ALL_LAYOUT); template void InitRandomTensor(const std::vector &dims, @@ -404,8 +405,18 @@ TEST(CudnnNormConvFp16, K1S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 3, output_channels = input_channels @@ -420,8 +431,18 @@ TEST(CudnnNormConvFp16, K3S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); 
+ platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, output_channels = input_channels * 4 @@ -436,8 +457,18 @@ TEST(CudnnNormConvFp16, K1S1O4) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4 diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 020277675797358bf87a58ac108e6eaaddb26ccc..54e4cbdc1624921e6946210a6a192d10fcbdb7dd 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { @@ -69,20 +70,21 @@ class FMHARef { ~FMHARef() {} void ComputeForward(const Tensor& qkv_input_tensor, + const Tensor* cache_kv_tensor, const Tensor* src_mask_tensor, - Tensor* transpose_2_out_tensor, Tensor* qk_out_tensor, + Tensor* transpose_2_out_tensor, + Tensor* cache_kv_out_tensor, Tensor* qk_out_tensor, Tensor* src_mask_out_tensor, Tensor* softmax_out_tensor, Tensor* dropout_mask_out_tensor, Tensor* dropout_out_tensor, Tensor* qktv_out_tensor, Tensor* fmha_out_tensor) { // input shape: [bs, seq_len, 3, num_head, head_dim] - // transpose with perm [2, 0, 1, 3, 4], + // transpose with perm [2, 0, 3, 1, 4], // output_shape: [3, bs, num_head, seq_len, head_dim] int ndims = 5; std::vector perm_1 = {2, 0, 3, 1, 4}; TransposeGPUKernelDriver(dev_ctx_, ndims, qkv_input_tensor, perm_1, transpose_2_out_tensor); - T* qkv_data = transpose_2_out_tensor->data(); T* qk_out_data = qk_out_tensor->data(); T* qktv_out_data = qktv_out_tensor->data(); @@ -90,11 +92,30 @@ class FMHARef { T* dropout_out_data = dropout_out_tensor->data(); T* fmha_out_data = fmha_out_tensor->data(); - int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; - int k_size = q_size; + auto out_seq_len = seq_len_; + if (cache_kv_tensor) { + // kv [2, bs, num_head, seq_len, head_dim] + auto kv_tensor = transpose_2_out_tensor->Slice(1, 3); + phi::funcs::ConcatFunctor concat; + // out [2, bs, num_head, cache_seq_len + seq_len, head_dim] + concat(dev_ctx_, {*cache_kv_tensor, kv_tensor}, 3, cache_kv_out_tensor); + out_seq_len = cache_kv_out_tensor->dims()[3]; + } + + int64_t q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; T* q_ptr = qkv_data; - T* k_ptr = q_ptr + 
q_size; - T* v_ptr = k_ptr + k_size; + T* k_ptr = nullptr; + T* v_ptr = nullptr; + + if (cache_kv_tensor) { + int64_t k_size = cache_kv_out_tensor->numel() / 2; + k_ptr = cache_kv_out_tensor->data(); + v_ptr = k_ptr + k_size; + } else { + int64_t k_size = q_size; + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + k_size; + } // q*k^t, batched_gemm CBLAS_TRANSPOSE transA = CblasNoTrans; @@ -102,7 +123,7 @@ class FMHARef { auto blas = phi::funcs::GetBlas(dev_ctx_); int gemm_batch_size = batch_size_ * num_head_; int gemm_m = seq_len_; - int gemm_n = seq_len_; + int gemm_n = out_seq_len; int gemm_k = head_dim_; T alpha = static_cast(1.0 / sqrt(head_dim_)); T beta = static_cast(0.0); @@ -133,16 +154,16 @@ class FMHARef { transB = CblasNoTrans; gemm_m = seq_len_; gemm_n = head_dim_; - gemm_k = seq_len_; + gemm_k = out_seq_len; alpha = static_cast(1.0); stride_a = gemm_m * gemm_k; stride_b = gemm_k * gemm_n; if (dropout_param_.dropout_prob_) { DropoutFwGPUKernelDriver( - dev_ctx_, dropout_param_.is_test_, - static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + dropout_param_.is_test_, static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, dropout_param_.is_fix_seed_, dropout_param_.seed_val_, static_cast(*softmax_out_tensor), dropout_param_.seed_, @@ -242,8 +263,9 @@ class FMHARef { // dropout bw if (dropout_param_.dropout_prob_) { DropoutGradGPUKernelDriver( - dev_ctx_, static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, static_cast(*dropout_out_grad_tensor), dropout_mask_out_tensor, softmax_out_grad_tensor->numel(), diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index d141800d61c0ec0b73fe2cc3c8d00dbf1de44cf2..e473f8ff0662cfc3fd7bdc5010bfa1dc08fba85f 100644 --- 
a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -61,6 +61,10 @@ class FusedAttentionOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("QKTVOut"), "Output", "QKTVOut", "FusedAttentionOp"); + if (ctx->HasInput("CacheKV")) { + OP_INOUT_CHECK(ctx->HasOutput("CacheKVOut"), "Output", "CacheKVOut", + "FusedAttentionOp"); + } if (ctx->HasInput("SrcMask")) { OP_INOUT_CHECK(ctx->HasOutput("SrcMaskOut"), "Output", "SrcMaskOut", "FusedAttentionOp"); @@ -105,12 +109,14 @@ class FusedAttentionOp : public framework::OperatorWithKernel { "input qkv_weight = [%s]", x_dim, y_dim)); - PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], - platform::errors::InvalidArgument( - "The dimensions of qkv_weight must be 4" - "(3, num_head, dim_head, dim_embed)," - "and must satisfy the limitations: " - "(num_head * dim_head == dim_embed)")); + if (ctx->Attrs().Get("ring_id") == -1) { + PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + } if (ctx->Attrs().Get("pre_layer_norm") == true) { ctx->SetOutputDim("LnMean", {x_dim[0] * x_dim[1]}); @@ -132,20 +138,64 @@ class FusedAttentionOp : public framework::OperatorWithKernel { // [3, batch_size, num_head, seq_len, head_size] ctx->SetOutputDim("TransposeOut2", {y_dim[0], x_dim[0], y_dim[1], x_dim[1], y_dim[2]}); - // [batch, num_head, seq_len, seq_len] - ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + + // cache_seq_len + seq_len if cache else seq_len + auto out_seq_len = x_dim[1]; + if (ctx->HasInput("CacheKV")) { + // [2, batch_size, num_head, cache_seq_len, head_size] + auto c_dim = ctx->GetInputDim("CacheKV"); + + PADDLE_ENFORCE_EQ( + c_dim.size(), 5, + paddle::platform::errors::InvalidArgument( + "The CacheKV must be 5 dims, but got %d", 
c_dim.size())); + PADDLE_ENFORCE_EQ(c_dim[0], 2, + paddle::platform::errors::InvalidArgument( + "The first dim of CacheKV must be 2, but got %d", + c_dim[0])); // 2 + PADDLE_ENFORCE_EQ(c_dim[1], x_dim[0], + paddle::platform::errors::InvalidArgument( + "The second dim of CacheKV must be equal with " + "batch size %d, but got %d", + x_dim[0], c_dim[1])); // batch_size + PADDLE_ENFORCE_EQ(c_dim[2], y_dim[1], + paddle::platform::errors::InvalidArgument( + "The third dim of CacheKV must be equal with num " + "head %d, but got %d", + y_dim[1], c_dim[2])); // num_head + PADDLE_ENFORCE_GE( + c_dim[3], 0, + paddle::platform::errors::InvalidArgument( + "The forth dim of CacheKV must be greater than 0, but got %d", + c_dim[3])); // cache_seq_len + PADDLE_ENFORCE_EQ(c_dim[4], y_dim[2], + paddle::platform::errors::InvalidArgument( + "The fifth dim of CacheKV must be equal with head " + "size %d, but got %d", + y_dim[2], c_dim[4])); // head_size + + out_seq_len += c_dim[3]; + // [3, batch_size, num_head, cache_seq_len + seq_len, head_size] + ctx->SetOutputDim("CacheKVOut", + {c_dim[0], c_dim[1], c_dim[2], out_seq_len, c_dim[4]}); + } + + // [batch, num_head, seq_len, out_seq_len] + ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); if (ctx->HasInput("SrcMask")) { - ctx->SetOutputDim("SrcMaskOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + ctx->SetOutputDim("SrcMaskOut", + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); } // the same as QKOut's shape. 
ctx->SetOutputDim("AttnDropoutOut", - {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); if (ctx->Attrs().Get("attn_dropout_is_test") == false) { ctx->SetOutputDim("AttnDropoutMaskOut", - {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); } - ctx->SetOutputDim("SoftmaxOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + ctx->SetOutputDim("SoftmaxOut", + {x_dim[0], y_dim[1], x_dim[1], out_seq_len}); // [batch_size, num_heads, seq_len, head_dim] ctx->SetOutputDim("QKTVOut", {x_dim[0], y_dim[1], x_dim[1], y_dim[2]}); // [batch_size, seq_len, number of heads*head size] @@ -182,6 +232,8 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddInput("QKVW", "The qkv weight tensor."); AddInput("QKVBias", "The qkv bias tensor.").AsDispensable(); + AddInput("CacheKV", "(optional) The cached KV for generation inference.") + .AsDispensable(); AddInput("SrcMask", "(optional) The attention mask tensor in fmha.") .AsDispensable(); AddInput("OutLinearW", "The out_linear weight tensor."); @@ -217,6 +269,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("BiasDropoutResidualOut", "Result of residual + dropout(src + bias).") .AsIntermediate(); + AddOutput("CacheKVOut", "The udpated cache KV."); AddOutput("Y", "Result after attention."); AddAttr("pre_layer_norm", @@ -324,6 +377,10 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { "0.0 and 0.001, But received [%s].", ln_epsilon)); }); + AddAttr( + "ring_id", + "ring id for tensor model parallel. 
distributed training and inference") + .SetDefault(-1); AddComment(R"DOC( Add fused attention op whose logic is as follows: diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 03f51fc5857985902c21ad12fefbdc9cdec6ef04..d26577f06fe683fb1528c61b4401b9e578c90c9f 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -27,11 +27,39 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static void AllReduce(framework::Tensor &tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext &ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void *sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void *recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + template class FusedAttentionOpKernel : public framework::OpKernel { public: @@ -56,6 +84,8 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *src_mask = ctx.Input("SrcMask"); auto *transpose_out_2 = ctx.Output("TransposeOut2"); + auto 
*cache_kv = ctx.Input("CacheKV"); + auto *cache_kv_out = ctx.Output("CacheKVOut"); auto *qk_out = ctx.Output("QKOut"); auto *qktv_out = ctx.Output("QKTVOut"); auto *softmax_out = ctx.Output("SoftmaxOut"); @@ -86,6 +116,7 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); + int ring_id = ctx.Attr("ring_id"); // final output. auto *out = ctx.Output("Y"); @@ -105,6 +136,10 @@ class FusedAttentionOpKernel : public framework::OpKernel { // get data ptr for FMHA. auto *transpose_out_2_data = transpose_out_2->mutable_data(ctx.GetPlace()); + auto *cache_kv_out_data = + (cache_kv_out == nullptr) + ? nullptr + : cache_kv_out->mutable_data(ctx.GetPlace()); auto *qk_out_data = qk_out->mutable_data(ctx.GetPlace()); auto *qktv_out_data = qktv_out->mutable_data(ctx.GetPlace()); auto *src_mask_out_data = @@ -161,9 +196,14 @@ class FusedAttentionOpKernel : public framework::OpKernel { output_size = hidden_size; // (transA, transB, compute_bias) = (false, false, false) + // NOTE(Yuang Liu): For general input size == output size, change the + // position won't have effects. For mp, the output size is mp_head * dkey + // which is actually the input size. While the input size is hidden size, + // which is actually the output size. So for out linear, switch the + // input size and output size. 
auto out_linear_compute = AttnMatMul(ctx.cuda_device_context(), false, false, bsz_seq, - output_size, input_size, false); + input_size, output_size, false); DropoutParam dropout_param2(ctx, 0); FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, @@ -186,15 +226,15 @@ class FusedAttentionOpKernel : public framework::OpKernel { qkv_bias_out); } if (qkv_bias == nullptr) { - fmha_ref_compute.ComputeForward(*qkv_out, src_mask, transpose_out_2, - qk_out, src_mask_out, softmax_out, - attn_dropout_mask_out, attn_dropout_out, - qktv_out, fmha_out); + fmha_ref_compute.ComputeForward( + *qkv_out, cache_kv, src_mask, transpose_out_2, cache_kv_out, qk_out, + src_mask_out, softmax_out, attn_dropout_mask_out, attn_dropout_out, + qktv_out, fmha_out); } else { - fmha_ref_compute.ComputeForward(*qkv_bias_out, src_mask, transpose_out_2, - qk_out, src_mask_out, softmax_out, - attn_dropout_mask_out, attn_dropout_out, - qktv_out, fmha_out); + fmha_ref_compute.ComputeForward( + *qkv_bias_out, cache_kv, src_mask, transpose_out_2, cache_kv_out, + qk_out, src_mask_out, softmax_out, attn_dropout_mask_out, + attn_dropout_out, qktv_out, fmha_out); } // fmha_out: [batch_size, seq_len, num_head, head_dim] @@ -202,6 +242,9 @@ class FusedAttentionOpKernel : public framework::OpKernel { // out_linear_out: [batch_size, seq_len, embed_dim] out_linear_compute.ComputeForward(out_linear_weight, fmha_out, nullptr, out_linear_out, nullptr); + // tensor model parallel + AllReduce(*out_linear_out, ring_id, ctx.cuda_device_context()); + if (pre_layer_norm) { // output = (residual + dropout(input + bias)) fused_dropout_layernorm_helper.ResidualDropoutBias( @@ -244,6 +287,7 @@ class FusedAttentionGradKernel : public framework::OpKernel { auto *seed_1 = ctx.HasInput("Seed1") ? 
ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); + int ring_id = ctx.Attr("ring_id"); // get inputs. auto *d_y = ctx.Input(framework::GradVarName("Y")); @@ -399,9 +443,10 @@ class FusedAttentionGradKernel : public framework::OpKernel { transA = false; transB = false; bool compute_bias = false; + // (b*s, num_head * dim_head) * (num_head * dim_head, dim_embed) auto out_linear_compute = AttnMatMul(ctx.cuda_device_context(), transA, transB, bsz_seq, - output_size, input_size, compute_bias); + input_size, output_size, compute_bias); DropoutParam dropout_param2(ctx, 0); FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, @@ -475,6 +520,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { qkv_compute.ComputeBackward(ln_out, qkv_weight, d_qkv_out, d_ln_out, d_qkv_weight, d_qkv_bias); } + // tensor model parallel + AllReduce(*d_ln_out, ring_id, ctx.cuda_device_context()); layer_norm_compute.ComputeBackward(x_data, d_ln_out_data, ln_scale_data, ln_mean_data, ln_var_data, d_x_data, d_ln_scale_data, d_ln_bias_data); @@ -486,6 +533,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { qkv_compute.ComputeBackward(input_x, qkv_weight, d_qkv_out, d_x, d_qkv_weight, d_qkv_bias); } + // tensor model parallel + AllReduce(*d_x, ring_id, ctx.cuda_device_context()); } // gradient accumulation std::vector ins; diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h index 994601a2f0608b4fc04966c7549c421f395f3ec7..9f5a1bad047b44b715e11e74d92fdca1982c96f8 100755 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -130,17 +130,17 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout, const T factor, const int64_t size, T *dx) { int64_t idx = blockDim.x * 
blockIdx.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_vec; LoadT src_vec; MaskLoadT mask_vec; - platform::Load(&dout[i], &dout_vec); - platform::Load(&mask[i], &mask_vec); - platform::Load(&src[i], &src_vec); + phi::Load(&dout[i], &dout_vec); + phi::Load(&mask[i], &mask_vec); + phi::Load(&src[i], &src_vec); StoreT dx_vec; #pragma unroll @@ -148,7 +148,7 @@ __global__ void FusedDropoutActGrad(Functor act_grad, const T *dout, T tmp = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; dx_vec[ii] = tmp * act_grad.UseOut(src_vec[ii]); } - platform::Store(dx_vec, &dx[i]); + phi::Store(dx_vec, &dx[i]); } } @@ -167,9 +167,9 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, T *dx, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum if (col_id * VecSize < cols) { @@ -180,10 +180,10 @@ __global__ void FusedDropoutActBiasGrad(Functor act_grad, const T *dout, LoadT bias_vec; MaskLoadT mask_vec; - platform::Load(&dout[index], &dout_vec); - platform::Load(&src[index], &src_vec); - platform::Load(&mask[index], &mask_vec); - platform::Load(&bias[col_id * VecSize], &bias_vec); + phi::Load(&dout[index], &dout_vec); + phi::Load(&src[index], &src_vec); + phi::Load(&mask[index], &mask_vec); + phi::Load(&bias[col_id * VecSize], &bias_vec); StoreT dx_vec; #pragma unroll @@ -194,7 +194,7 @@ __global__ void FusedDropoutActBiasGrad(Functor 
act_grad, const T *dout, dx_vec[i] = val; tmp_sum[i] += val; } - platform::Store(dx_vec, &dx[index]); + phi::Store(dx_vec, &dx[index]); } } diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu index 2381b5b7fdfb85cbaa3fd66a10c5b630bb515f15..717c1732b7b3acf8528887aae43471c0dc0716e3 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu @@ -20,8 +20,14 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" #include "paddle/fluid/operators/fused/fused_dropout_test.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/functors.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif + namespace framework = paddle::framework; namespace platform = paddle::platform; namespace details = paddle::operators::details; diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index f79277e4e8f0d22cedafc9f7b40b56ecd2d6a817..6bf3a7114f4ced3c7c6ecd1f1afeca60ff66528f 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -21,11 +21,11 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index d7952df470d81566c3833e79e8cfa31a7d2dc68c..18c7187fc8e64c9fed8a86a984954b5420c1e5b5 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -31,7 +31,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace memory = paddle::memory; -USE_OP(dropout); +USE_OP_ITSELF(dropout); USE_OP(layer_norm); template diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 0c83c36b47583d49f022cb103894592d3a1b4ae7..7308f30779248e64f55e10b0661d2c98d263416c 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -14,9 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h" #include -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/cpu_vec.h" #include "paddle/phi/kernels/funcs/sequence2batch.h" namespace paddle { @@ -243,12 +243,12 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { auto& act_cell_str = ctx.Attr("cell_activation"); \ auto& act_cand_str = ctx.Attr("candidate_activation"); \ if (platform::MayIUse(platform::avx)) { \ - math::VecActivations act_functor; \ + phi::funcs::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ } else { \ - math::VecActivations act_functor; \ + phi::funcs::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index 0c8eae4260441f6c873b48735a01b967b70ef4bb..f3f8f1742757783a082437638f67407700963eb1 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -195,6 +195,8 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddAttr("dropout1_seed", "Dropout1 random seed.").SetDefault(0); AddAttr("dropout2_seed", "Dropout2 random seed.").SetDefault(0); + AddAttr("ring_id", "ring id for tensor model parallel.") + .SetDefault(-1); AddComment(R"DOC( the function of fused_feedforward operator is the same as the following pseudo code: residual = src; diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 3131269955bdd17a0552836121589d8edeb4a38e..c38d9f7d4bcbd25b3111b35a918de0f4ebdabefb 100644 --- 
a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -21,11 +21,39 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static void AllReduce(framework::Tensor& tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext& ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void* sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void* recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + template class FusedFeedForwardKernel : public framework::OpKernel { public: @@ -56,7 +84,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor* dropout1_out, framework::Tensor* dropout2_out, const int bsz_seq, const int d_model, const int dim_feedforward, const std::string& act_method, const bool pre_layer_norm, - const float epsilon1, const float epsilon2, + const float epsilon1, const float epsilon2, const int ring_id, const DropoutParam& dropout_param1, const DropoutParam& dropout_param2, const platform::CUDADeviceContext& ctx) const { @@ -95,6 
+123,10 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor linear2_out; linear2_out.mutable_data({bsz_seq, d_model}, place); MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out); + + // tensor model parallel + AllReduce(linear2_out, ring_id, ctx); + if (!pre_layer_norm) { fused_dropout_layernorm_helper.LayernormResidualDropoutBias( ctx, linear2_out.data(), x.data(), linear2_bias_ptr, @@ -150,6 +182,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -186,7 +219,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { dropout2_mask, ln1_mean, ln1_variance, ln2_mean, ln2_variance, linear1_out, ln1_out, dropout1_out, dropout2_out, bsz_seq, d_model, dim_feedforward, act_method, pre_layer_norm, epsilon1, epsilon2, - dropout_param1, dropout_param2, context.cuda_device_context()); + ring_id, dropout_param1, dropout_param2, context.cuda_device_context()); } }; @@ -231,7 +264,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const int dim_feedforward, const DropoutParam& dropout_param1, const DropoutParam& dropout_param2, const std::string& act_method, const bool pre_layer_norm, const float epsilon1, const float epsilon2, - const platform::CUDADeviceContext& ctx) const { + const int ring_id, const platform::CUDADeviceContext& ctx) const { FusedDropoutLayerNormHelper pre_layernorm_helper( bsz_seq, d_model, epsilon1); FusedDropoutHelper fused_act_dropout_helper( @@ -295,13 +328,16 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { d_ln1_out.mutable_data({bsz_seq, d_model}, place); MatMulGrad(ctx, d_linear1_out, *ln1_out, linear1_weight, &d_ln1_out, d_linear1_weight); - + // tensor model parallel + AllReduce(d_ln1_out, ring_id, ctx); 
pre_layernorm_helper.LayerNormGrad( ctx, d_ln1_out.data(), x.data(), ln1_gamma_ptr, ln1_mean->data(), ln1_variance->data(), d_x->data(), d_ln1_gamma_ptr, d_ln1_beta_ptr); } else { MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight); + // tensor model parallel + AllReduce(*d_x, ring_id, ctx); } std::vector ins(2); std::vector outs(1); @@ -376,6 +412,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); const std::string act_method = context.Attr("act_method"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -419,7 +456,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { d_linear1_bias, d_linear2_weight, d_linear2_bias, d_ln1_scale, d_ln1_bias, d_ln2_scale, d_ln2_bias, bsz_seq, d_model, dim_feedforward, dropout_param1, dropout_param2, act_method, - pre_layer_norm, epsilon1, epsilon2, context.cuda_device_context()); + pre_layer_norm, epsilon1, epsilon2, ring_id, + context.cuda_device_context()); } }; } // namespace operators diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4c4e3661e6d6edc5ea95b77cd283cc99afcca8ed --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -0,0 +1,353 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class FusedGemmEpilogueOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasInput("Bias"), "Output", "Bias", + "FusedGemmEpilogueOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "FusedGemmEpilogueOp"); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto bias_dims = ctx->GetInputDim("Bias"); + + auto trans_x = ctx->Attrs().Get("trans_x"); + auto trans_y = ctx->Attrs().Get("trans_y"); + + PADDLE_ENFORCE_EQ( + y_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor Y's dimension of FusedGemmEpilogueOp " + " should be 2, but got %d.", + y_dims.size())); + + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor X's dimension of FusedGemmEpilogueOp " + " should be >= 2, but got %d.", + x_dims.size())); + + PADDLE_ENFORCE_EQ( + bias_dims.size(), 1, + platform::errors::InvalidArgument( + "The Input tensor bias's dimension of FusedGemmEpilogueOp " + " should be == 1, but got %d.", + bias_dims.size())); + + PADDLE_ENFORCE_EQ(bias_dims[0], trans_y ? 
y_dims[0] : y_dims[1], + platform::errors::InvalidArgument( + "The Input tensor bias's dimension 0" + " should be == Y[-1], but got bias's shape = [%s] " + "and Y's shape = [%s]", + bias_dims, y_dims)); + + auto x_mat_dims = + phi::flatten_to_2d(x_dims, trans_x ? 1 : x_dims.size() - 1); + + int K_from_x = trans_x ? x_mat_dims[0] : x_mat_dims[1]; + int K_from_y = trans_y ? y_dims[1] : y_dims[0]; + + PADDLE_ENFORCE_EQ( + K_from_x, K_from_y, + platform::errors::InvalidArgument( + "The last dimension of X should be equal with Y's first dimension." + "But received X[-1] = [%d], Y[0] = [%d].", + K_from_x, K_from_y)); + + auto activation = ctx->Attrs().Get("activation"); + + if ((activation != "relu") && (activation != "gelu") && + (activation != "none")) { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation=%s.", + activation)); + } + + if (activation == "none" && ctx->HasOutput("ReserveSpace")) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The ReserveSpace would not be used when activation = \"none\"")); + } + + // cublasLt's restriction for auxiliary. + if (ctx->HasOutput("ReserveSpace") && activation != "none") { + int min_size_of_n = activation == "relu" ? 128 : 8; + int N_size = trans_y ? 
y_dims[0] : y_dims[1]; + PADDLE_ENFORCE_EQ(N_size % min_size_of_n, 0, + platform::errors::InvalidArgument( + "The output dimension N (X(MxK) * Y(KxN) = C(MxN)) " + "should be multiple of %d when auxiliary_key given " + "and activation=%s, but got N = %d.", + min_size_of_n, activation, N_size)); + } + + std::vector out_dims; + out_dims.reserve(static_cast(x_dims.size())); + if (trans_x) { + for (int i = 1; i < x_dims.size(); ++i) out_dims.push_back(x_dims[i]); + } else { + for (int i = 0; i < x_dims.size() - 1; ++i) out_dims.push_back(x_dims[i]); + } + + if (trans_y) { + out_dims.push_back(y_dims[0]); + } else { + out_dims.push_back(y_dims[1]); + } + + ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); + // Note (Ming Huang): Reserve space of relu is a bit-mask, + // which cannot pass nan_and_inf checking if shape is set. + if (activation == "gelu" && ctx->HasOutput("ReserveSpace")) { + ctx->SetOutputDim("ReserveSpace", phi::make_ddim(out_dims)); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + } +}; + +class FusedGemmEpilogueOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor X of Out = Act((X * Y) + Bias)."); + AddInput("Y", "The input tensor Y of Out = Act((X * Y) + Bias)."); + AddInput("Bias", "The input tensor bias of Out = Act((X * Y) + Bias)."); + + AddOutput("Out", "The output tensor Out of Out = Act((X * Y) + Bias)."); + AddOutput("ReserveSpace", + R"DOC(Reserve GPU space to place + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue op. 
If not given (empty string), the + auxiliary mode would not be enable.)DOC") + .AsDispensable() + .AsExtra(); + + AddAttr( + "trans_x", + R"DOC((bool, default false), Whether to transpose input tensor X + or not. The input tensor X coulbe be more than two dimension. When + set trans_x=true, it would fully reverse X. For instant: X with shpae + [d0, d1, d2, d3] -> [d3, d2, d1, d0].)DOC") + .SetDefault(false); + AddAttr( + "trans_y", + R"DOC((bool, default false), Whether to transpose input tensor Y + or not. The input tensor Y should be two dimension. When + set trans_y=true, it would transpose Y. For instant: Y with shpae + [d0, d1] -> [d1, d0].)DOC") + .SetDefault(false); + + AddAttr( + "activation", + R"DOC((string, default none), The activation function. It could be + one of {none, relu, gelu}. When none is given, Act would be null + operations)DOC") + .SetDefault("none"); + + AddComment(R"DOC( +FusedGemmEpilogue Operator +This operator is used to perform Activeation(Elementwise_add(Matmul(X, Y), bias)). +It is equal to paddle.nn.Linear + Activation (None, ReLU or GeLU). + +Note: +X could be more than two dimension and would be flatten to 2D for computing. 
+X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] +)DOC"); + } +}; + +class FusedGemmEpilogueGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("DOut"), "Input", "DOut", + "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "FusedGemmEpilogueGradOp"); + OP_INOUT_CHECK(ctx->HasOutput("DY"), "Output", "DY", "FusedGemmEpilogueOp"); + + auto dout_dims = ctx->GetInputDim("DOut"); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_GE( + dout_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor DOut's dimension of FusedGemmEpilogueGradOp " + " should be >= 2, but got %d.", + dout_dims.size())); + + PADDLE_ENFORCE_EQ( + y_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor Y's dimension of FusedGemmEpilogueGradOp " + " should be 2, but got %d.", + y_dims.size())); + + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::InvalidArgument( + "The Input tensor X's dimension of FusedGemmEpilogueGradOp " + " should be >= 2, but got %d.", + x_dims.size())); + + PADDLE_ENFORCE_EQ( + dout_dims.size(), x_dims.size(), + platform::errors::InvalidArgument( + "The Input tensor DOut's and X's dimension of " + "FusedGemmEpilogueGradOp " + " should be the same, but got DOut's dim = %d and X's = %d.", + dout_dims.size(), x_dims.size())); + + auto dout_mat_dims = phi::flatten_to_2d(dout_dims, dout_dims.size() - 1); + + auto x_mat_dims = phi::flatten_to_2d(x_dims, x_dims.size() - 1); + + PADDLE_ENFORCE_EQ( + dout_mat_dims[1], y_dims[1], + platform::errors::InvalidArgument( + "The last dimension of DOut should be equal with Y's last" + "dimension. 
But received DOut[-1] = [%d], Y[1] = [%d].", + dout_mat_dims[1], y_dims[1])); + + PADDLE_ENFORCE_EQ( + dout_mat_dims[0], x_mat_dims[0], + platform::errors::InvalidArgument( + "The first dimension of DOut should be equal with X's first" + "dimension. But received DOut[0] = [%d], Y[0] = [%d].", + dout_mat_dims[0], x_mat_dims[0])); + + auto activation_grad = ctx->Attrs().Get("activation_grad"); + if ((activation_grad != "relu_grad") && (activation_grad != "gelu_grad") && + (activation_grad != "none")) { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation=%s.", + activation_grad)); + } + + if (activation_grad != "none" && !ctx->HasInput("ReserveSpace")) { + PADDLE_ENFORCE_EQ(true, false, + platform::errors::InvalidArgument( + "The ReserveSpace should not be empty. " + "when activation_grad == {relu_grad, gelu_grad}.")); + } + + if (ctx->HasOutput("DX")) { + std::vector dx_dims; + dx_dims.reserve(static_cast(x_dims.size())); + for (int i = 0; i < x_dims.size(); ++i) { + dx_dims.push_back(x_dims[i]); + } + ctx->SetOutputDim("DX", phi::make_ddim(dx_dims)); + } + + std::vector dy_dims(y_dims.Get(), y_dims.Get() + y_dims.size()); + ctx->SetOutputDim("DY", phi::make_ddim(dy_dims)); + + if (ctx->HasOutput("DBias")) { + std::vector dbias_dims; + dbias_dims.push_back(y_dims[1]); + ctx->SetOutputDim("DBias", phi::make_ddim(dbias_dims)); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DOut"); + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); + } +}; + +class FusedGemmEpilogueGradOpMaker : public 
framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("DOut", + "The input grad tensor to Out of Out = (Act(X) * Y) + bias"); + AddInput("X", "The input tensor X of Out = (Act(X) * Y) + bias"); + AddInput("Y", "The input tensor Y of Out = (Act(X) * Y) + bias"); + AddInput("ReserveSpace", + R"DOC(A GPU space to fetch + auxiliary data pointer. It is used to pass auxiliary data pointer + for fused_gemm_epilogue_grad op. If not given (empty string), the + auxiliary mode would not be enable.)DOC") + .AsDispensable(); + + AddOutput("DX", "The output grad tensor to X of Out = (Act(X) * Y) + bias.") + .AsDispensable(); + AddOutput("DY", + "The output grad tensor to Y of Out = (Act(X) * Y) + bias."); + AddOutput("DBias", + "The output grad tensor to bias of Out = (Act(X) * Y) + bias.") + .AsDispensable(); + + AddAttr( + "activation_grad", + R"DOC((string, default none), The backward activation function. It could be + one of {none, relu_grad, gelu_grad}. When none is given, The backward Act would + be null operations)DOC") + .SetDefault("none"); + + AddComment(R"DOC( +FusedGemmEpilogueGrad Operator +This operator is used to perform backward of Elementwise_add(Matmul(Activeation(X), Y), bias). +It is equal to Activation (None, ReLU or GeLU) + paddle.nn.Linear. + +Note: +X could be more than two dimension and would be flatten to 2D for computing. 
+X with shape [d0, d1, d2, d3] -> X_2D with shape [d0*d1*d2, d3] +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_gemm_epilogue, ops::FusedGemmEpilogueOp, + ops::FusedGemmEpilogueOpMaker) +REGISTER_OPERATOR(fused_gemm_epilogue_grad, ops::FusedGemmEpilogueGradOp, + ops::FusedGemmEpilogueGradOpMaker) diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..e16c9e8f483ccc2cbf1d7006159cccfe906dd06b --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -0,0 +1,376 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/dynload/cublasLt.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedGemmEpilogueKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* bias = ctx.Input("Bias"); + + Tensor* out = ctx.Output("Out"); + Tensor* reserve_space = ctx.Output("ReserveSpace"); + + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + + std::string activation = ctx.Attr("activation"); + bool enable_auxiliary = reserve_space == nullptr ? false : true; + + out->mutable_data(ctx.GetPlace()); + auto* out_data = out->data(); + + auto x_mat_dims = + phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; + int64_t K = trans_y ? y->dims()[1] : y->dims()[0]; + int64_t N = trans_y ? y->dims()[0] : y->dims()[1]; + + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + scale_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + cublasLtMatmulDesc_t operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &operation_desc, compute_type, scale_type)); + cublasOperation_t transx = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t transy = trans_y ? 
CUBLAS_OP_T : CUBLAS_OP_N; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &transx, + sizeof(transx))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &transy, + sizeof(transy))); + + cublasLtEpilogue_t epiloque_func = + get_epilogue_type_(activation, enable_auxiliary); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epiloque_func, + sizeof(epiloque_func))); + const T* bias_data = bias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias_data, + sizeof(bias_data))); + + if (enable_auxiliary && activation != "none") { + size_t reserve_space_size = 0; + if (activation == "relu") { + // Count in bits. + reserve_space_size = phi::product(out->dims()) / 8; + } else { + reserve_space_size = phi::product(out->dims()) * sizeof(T); + } + reserve_space->mutable_data(ctx.GetPlace(), out->type(), + reserve_space_size); + void* aux_data = reinterpret_cast(reserve_space->data()); + + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &aux_data, sizeof(aux_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, + sizeof(N))); + } + + cublasLtMatrixLayout_t x_desc = NULL, y_desc = NULL, out_desc = NULL; + if (trans_x) + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, M, K, M)); + else + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, K, M, K)); + if (trans_y) + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, K, N, 
K)); + else + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, N, K, N)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &out_desc, mat_type, N, M, N)); + + cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); + size_t workspace_size = 4 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; + cudaStream_t stream = dev_ctx.stream(); + memory::allocation::AllocationPtr workspace = + memory::Alloc(dev_ctx, workspace_size); + + double alpha64 = 1.0, beta64 = 0.0; + float alpha32 = 1.0f, beta32 = 0.0f; + void *alpha = nullptr, *beta = nullptr; + if (std::is_same::value) { + alpha = &alpha64; + beta = &beta64; + } else { + alpha = &alpha32; + beta = &beta32; + } + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, operation_desc, alpha, y->data(), y_desc, x->data(), + x_desc, beta, out_data, out_desc, out_data, out_desc, algo, + workspace->ptr(), workspace_size, stream)); + } + + private: + static cublasLtEpilogue_t get_epilogue_type_(const std::string& activation, + bool enable_auxiliary) { + if (activation == "relu") { + return enable_auxiliary ? CUBLASLT_EPILOGUE_RELU_AUX_BIAS + : CUBLASLT_EPILOGUE_RELU_BIAS; + } else if (activation == "gelu") { + return enable_auxiliary ? CUBLASLT_EPILOGUE_GELU_AUX_BIAS + : CUBLASLT_EPILOGUE_GELU_BIAS; + } else if (activation == "none") { + return CUBLASLT_EPILOGUE_BIAS; + } else { + PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation attribute of fused_gemm_epilogue op should be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." 
+ "But received activation=%s.", + activation)); + } + } +}; + +template +class FusedGemmEpilogueGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* dout = ctx.Input("DOut"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* reserve_space = ctx.Input("ReserveSpace"); + + Tensor* dx = ctx.Output("DX"); + Tensor* dy = ctx.Output("DY"); + Tensor* dbias = ctx.Output("DBias"); + + std::string activation_grad = ctx.Attr("activation_grad"); + + auto dout_mat_dims = + phi::flatten_to_2d(dout->dims(), dout->dims().size() - 1); + auto x_mat_dims = phi::flatten_to_2d(x->dims(), x->dims().size() - 1); + + int64_t M = x_mat_dims[0]; + int64_t K = y->dims()[0]; + int64_t N = y->dims()[1]; + + cudaDataType_t mat_type = CUDA_R_32F; + cudaDataType_t scale_type = CUDA_R_32F; + cublasComputeType_t compute_type = CUBLAS_COMPUTE_32F; + if (std::is_same::value) { + mat_type = CUDA_R_16F; + scale_type = CUDA_R_16F; + } + if (std::is_same::value) { + mat_type = CUDA_R_64F; + scale_type = CUDA_R_64F; + compute_type = CUBLAS_COMPUTE_64F; + } + + cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle(); + size_t workspace_size = 4 * 1024 * 1024; + const cublasLtMatmulAlgo_t* algo = nullptr; + cudaStream_t stream = dev_ctx.stream(); + + double alpha64 = 1.0, beta64 = 0.0; + float alpha32 = 1.0f, beta32 = 0.0f; + void *alpha = nullptr, *beta = nullptr; + if (std::is_same::value) { + alpha = &alpha64; + beta = &beta64; + } else { + alpha = &alpha32; + beta = &beta32; + } + + cublasOperation_t trans_dout = CUBLAS_OP_N; + cublasLtMatrixLayout_t dout_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dout_desc, mat_type, N, M, N)); + + if (dx) { + cublasLtMatmulDesc_t dx_operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + 
&dx_operation_desc, compute_type, scale_type)); + cublasOperation_t trans_y = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_dout, + sizeof(trans_dout))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_y, + sizeof(trans_y))); + cublasLtEpilogue_t epiloque_func_for_dx = + get_epilogue_type_(activation_grad); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + &epiloque_func_for_dx, sizeof(epiloque_func_for_dx))); + + if (activation_grad != "none") { + auto* aux_data = reserve_space->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, + &aux_data, sizeof(aux_data))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dx_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &N, + sizeof(N))); + } + + cublasLtMatrixLayout_t y_desc = NULL, dx_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &y_desc, mat_type, N, K, N)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dx_desc, mat_type, K, M, K)); + + memory::allocation::AllocationPtr dx_workspace = + memory::Alloc(dev_ctx, workspace_size); + + dx->mutable_data(ctx.GetPlace()); + auto* dx_data = dx->data(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, dx_operation_desc, alpha, y->data(), y_desc, + dout->data(), dout_desc, beta, dx_data, dx_desc, dx_data, dx_desc, + algo, dx_workspace->ptr(), workspace_size, stream)); + } + + if (dy) { + cublasLtMatmulDesc_t dy_operation_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmulDescCreate( + &dy_operation_desc, compute_type, scale_type)); 
+ cublasOperation_t trans_x = CUBLAS_OP_T; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSA, &trans_dout, + sizeof(trans_dout))); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_TRANSB, &trans_x, + sizeof(trans_x))); + cublasLtEpilogue_t epiloque_func_for_dy = dbias == nullptr + ? CUBLASLT_EPILOGUE_DEFAULT + : CUBLASLT_EPILOGUE_BGRADA; + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + &epiloque_func_for_dy, sizeof(epiloque_func_for_dy))); + + if (dbias) { + dbias->mutable_data(ctx.GetPlace()); + auto* dbias_data = dbias->data(); + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cublasLtMatmulDescSetAttribute( + dy_operation_desc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, + &dbias_data, sizeof(dbias_data))); + } + + cublasLtMatrixLayout_t x_desc = NULL, dy_desc = NULL; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &x_desc, mat_type, K, M, K)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatrixLayoutCreate( + &dy_desc, mat_type, N, K, N)); + + memory::allocation::AllocationPtr dy_workspace = + memory::Alloc(dev_ctx, workspace_size); + + dy->mutable_data(ctx.GetPlace()); + auto* dy_data = dy->data(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasLtMatmul( + lt_handle, dy_operation_desc, alpha, dout->data(), dout_desc, + x->data(), x_desc, beta, dy_data, dy_desc, dy_data, dy_desc, algo, + dy_workspace->ptr(), workspace_size, stream)); + } + } + + private: + static cublasLtEpilogue_t get_epilogue_type_( + const std::string& activation_grad) { + if (activation_grad == "relu_grad") { + return CUBLASLT_EPILOGUE_DRELU; + } else if (activation_grad == "gelu_grad") { + return CUBLASLT_EPILOGUE_DGELU; + } else if (activation_grad == "none") { + return CUBLASLT_EPILOGUE_DEFAULT; + } else { 
+ PADDLE_ENFORCE_EQ( + true, false, + platform::errors::InvalidArgument( + "The activation_grad attribute of fused_gemm_epilogue op should " + "be" + " one of {\"none\", \"relu\", \"gelu\"}. But received %s." + "But received activation_grad=%s.", + activation_grad)); + } + } +}; + +} // namespace operators +} // namespace paddle + +#if CUDA_VERSION >= 11060 +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fused_gemm_epilogue, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel); + +REGISTER_OP_CUDA_KERNEL( + fused_gemm_epilogue_grad, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel, + ops::FusedGemmEpilogueGradKernel); +#endif diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index ceba3accca7727b5e4f22951d87f9e91034e3403..d53a24a57e3cc1ede127f497a9be9e3b5fa1ab0b 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -42,12 +42,12 @@ __device__ void CalcLayernormY( const LayerNormScaleBiasT *bias, const T *x, T *y, const int row_id, const int col_id, const int cols, const LayerNormParamType mean_val, const LayerNormParamType invvar) { - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using LoadU = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using LoadU = phi::AlignedVector; using LoadScaleOrBias = - platform::AlignedVector, - VecSize>; + phi::AlignedVector, + VecSize>; for (int i = col_id * VecSize; i < cols; i += blockDim.x * VecSize) { LoadScaleOrBias scale_vec; LoadScaleOrBias bias_vec; @@ -60,15 +60,15 @@ __device__ void CalcLayernormY( static_cast>(0); } // vectorize load data from global - platform::Load(&x[row_id * cols + i], &x_vec); + phi::Load(&x[row_id * cols + i], &x_vec); if 
(scale != nullptr) { - platform::Load, - VecSize>(&scale[i], &scale_vec); + phi::Load, VecSize>( + &scale[i], &scale_vec); } if (bias != nullptr) { - platform::Load, - VecSize>(&bias[i], &bias_vec); + phi::Load, VecSize>( + &bias[i], &bias_vec); } StoreT y_vec; @@ -78,7 +78,7 @@ __device__ void CalcLayernormY( (static_cast(x_vec[ii]) - mean_val) * invvar + static_cast(bias_vec[ii])); } - platform::Store(y_vec, &y[row_id * cols + i]); + phi::Store(y_vec, &y[row_id * cols + i]); } } @@ -190,9 +190,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( const ScaleT *__restrict__ beta_ptr, MaskType *__restrict__ mask_out_ptr, U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, T *__restrict__ residual_out_ptr, T *__restrict__ y_ptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; - using MaskStoreT = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; + using MaskStoreT = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -214,8 +214,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( Vec_scale beta[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); - platform::Load(beta_ptr + col * VecSize, &beta[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(beta_ptr + col * VecSize, &beta[it]); col += THREADS_PER_ROW; } @@ -225,10 +225,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( Vec residual[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); - platform::Load( - residual_ptr + row * LN_NUM_COLS + col * VecSize, &residual[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); + phi::Load(residual_ptr + row * LN_NUM_COLS + col * VecSize, + &residual[it]); col += 
THREADS_PER_ROW; } @@ -270,9 +269,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( // store dropout_residual_out and mask_out #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store( + phi::Store( x[it], residual_out_ptr + row * LN_NUM_COLS + col * VecSize); - platform::Store( + phi::Store( mask_vec[it], mask_out_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } @@ -333,8 +332,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store(x[it], - y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } } diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index cc14d0680d381ff2bbe73ee712e218c9c4d79185..032440d7f0478dc087e3ba38274f2a31a9a66a23 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -19,6 +19,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif /** * @brief The unit test of fused_layernorm_residual_dropout_bias diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index 1b135ad6098e58f457f5d21e73ac6d1a6a7c4074..1d3085a013f81ee9dca21468476df8f621bb26c2 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -32,9 +32,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( const T *__restrict__ bias, T *dst, MaskType *mask, const bool is_test, typename details::MPTypeTrait::Type *mean_val, typename details::MPTypeTrait::Type *var_val, Functor act_func) { - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskStoreT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskStoreT = phi::AlignedVector; using U = typename details::MPTypeTrait::Type; LoadT src_vec; @@ -46,14 +46,13 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( residual_vec[ii] = static_cast(0); } // vectorize load data from global - platform::Load(&src[row_id * cols + col_id], &src_vec); + phi::Load(&src[row_id * cols + col_id], &src_vec); if (residual) { - platform::Load(&residual[row_id * cols + col_id], - &residual_vec); + phi::Load(&residual[row_id * cols + col_id], &residual_vec); } if (bias) { - platform::Load(&bias[col_id], &bias_vec); + phi::Load(&bias[col_id], &bias_vec); } MaskStoreT mask_vec; @@ -89,9 +88,9 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( } // store result 
to global - platform::Store(dest_vec, &dst[row_id * cols + col_id]); + phi::Store(dest_vec, &dst[row_id * cols + col_id]); if (!is_test) { - platform::Store(mask_vec, &mask[row_id * cols + col_id]); + phi::Store(mask_vec, &mask[row_id * cols + col_id]); } } @@ -176,21 +175,21 @@ __global__ void FusedResidualDropoutGrad(const T *dout, const MaskType *mask, T *dx) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) { LoadT dout_vec; MaskLoadT mask_vec; - platform::Load(&dout[i], &dout_vec); - platform::Load(&mask[i], &mask_vec); + phi::Load(&dout[i], &dout_vec); + phi::Load(&mask[i], &mask_vec); StoreT dx_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { dx_vec[ii] = dout_vec[ii] * static_cast(mask_vec[ii]) * factor; } - platform::Store(dx_vec, &dx[i]); + phi::Store(dx_vec, &dx[i]); } } @@ -209,9 +208,9 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; - using LoadT = platform::AlignedVector; - using StoreT = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using LoadT = phi::AlignedVector; + using StoreT = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; T tmp_sum[VecSize] = {static_cast(0)}; // calculate the dx and temporary sum @@ -221,8 +220,8 @@ __global__ void FusedResidualDropoutBiasGrad(const T *dout, LoadT out_vec; MaskLoadT mask_vec; StoreT dx_vec; - platform::Load(&dout[index], &out_vec); - platform::Load(&mask[index], &mask_vec); + phi::Load(&dout[index], &out_vec); + phi::Load(&mask[index], &mask_vec); #pragma unroll for (int i = 0; i < VecSize; i++) { @@ -230,7 +229,7 @@ __global__ void 
FusedResidualDropoutBiasGrad(const T *dout, tmp_sum[i] += out_vec[i]; } - platform::Store(dx_vec, &dx[index]); + phi::Store(dx_vec, &dx[index]); } } diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 1a12e6b565f02035b3fb9673636c2344823f288e..5dff5e2225f4f3bf3a20daa02b2b4194bd8cb99e 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -19,6 +19,12 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 88fb7349d538afd6d7bf4fa6947ac21307db66d8..1000d0488dc3ffcf6cde977be47ce77d2bc947a7 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -14,10 +14,10 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h" #include -#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/cpu_vec.h" namespace paddle { namespace operators { @@ -196,10 +196,10 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { std::function fc_act; auto& fc_act_str = ctx.Attr("fc_activation"); if (platform::MayIUse(platform::avx)) { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; fc_act = act_functor(fc_act_str); } else { - math::VecActivations act_functor; + phi::funcs::VecActivations act_functor; fc_act = act_functor(fc_act_str); } diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc index 8da900d84f9bcedd5e4b318837fe1bb29697a6be..7d7d6ae81a0935402f94cbc16e31fbba8009ce9c 100644 --- a/paddle/fluid/operators/gather_nd_op.cc +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -12,11 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/gather_nd_op.h" -#include -#include -#include -#include "paddle/phi/core/ddim.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -25,48 +24,10 @@ class GatherNdOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of GatherNdOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of GatherNdOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of GatherNdOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - auto x_dims_size = x_dims.size(); - auto index_dims = ctx->GetInputDim("Index"); - auto index_dims_size = index_dims.size(); - - PADDLE_ENFORCE_LE( - index_dims[index_dims_size - 1], x_dims_size, - platform::errors::InvalidArgument( - "Input(Index).shape[-1] should be no greater than Input(X).rank")); - PADDLE_ENFORCE_GE(index_dims_size, 1UL, - platform::errors::InvalidArgument( - "The rank of Input(Index) should be greater than 1")); - - std::vector result_dims; - // The result dims is - // Index.shape[:-1] + X.shape[Index.shape[-1]:] - for (int i = 0; i < index_dims_size - 1; ++i) { - result_dims.emplace_back(index_dims[i]); - } - for (int i = index_dims[index_dims_size - 1]; i < x_dims_size; ++i) { - result_dims.emplace_back(x_dims[i]); - } - - ctx->SetOutputDim("Out", phi::make_ddim(result_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto* x = 
ctx.Input("X"); + auto* x = ctx.Input("X"); const auto& x_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); return framework::OpKernelType( x_type, @@ -80,11 +41,6 @@ class GatherNdGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -173,23 +129,17 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(GatherNdGradNoNeedBufferVarInferer, "X"); namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(gather_nd, GatherNdInferShapeFunctor, + PD_INFER_META(phi::GatherNdInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(gather_nd_grad, GatherNdGradInferShapeFunctor, + PD_INFER_META(phi::GatherNdGradInferMeta)); + REGISTER_OPERATOR(gather_nd, ops::GatherNdOp, ops::GatherNdOpMaker, ops::GatherNdGradOpMaker, - ops::GatherNdGradOpMaker); + ops::GatherNdGradOpMaker, + GatherNdInferShapeFunctor); REGISTER_OPERATOR(gather_nd_grad, ops::GatherNdGradOp, - ops::GatherNdGradNoNeedBufferVarInferer); - -REGISTER_OP_CPU_KERNEL(gather_nd, ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel, - ops::GatherNdOpKernel); - -REGISTER_OP_CPU_KERNEL(gather_nd_grad, ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel, - ops::GatherNdGradOpKernel); + ops::GatherNdGradNoNeedBufferVarInferer, + GatherNdGradInferShapeFunctor); diff --git a/paddle/fluid/operators/gather_nd_op.cu b/paddle/fluid/operators/gather_nd_op.cu deleted file mode 100644 index 338c44116183415ab09881c470e6d34283b015ed..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/operators/gather_nd_op.cu +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/gather_nd_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class GatherNdOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - const auto &index_type = index->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - auto &dev_ctx = ctx.cuda_device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUGatherNd(dev_ctx, *x, *index, output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GPUGatherNd(dev_ctx, *x, 
*index, output); - } - } -}; - -template -class GatherNdGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - const auto &index_type = index->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - - auto &dev_ctx = ctx.cuda_device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUScatterNdAdd(dev_ctx, *dO, *index, dX); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::GPUScatterNdAdd(dev_ctx, *dO, *index, dX); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel, - ops::GatherNdOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(gather_nd_grad, ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel, - ops::GatherNdGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_nd_op.h 
b/paddle/fluid/operators/gather_nd_op.h deleted file mode 100644 index d54261008e47b89151248a8372ede4b524d999bf..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/gather_nd_op.h +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/gather.h" -#include "paddle/phi/kernels/funcs/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class GatherNdOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *output = ctx.Output("Out"); - - output->mutable_data(ctx.GetPlace()); - if (x->numel() == 0) return; - - auto index_type = index->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - auto &dev_ctx = ctx.template device_context(); - 
if (index_type == phi::DataType::INT32) { - phi::funcs::CPUGatherNd(dev_ctx, *x, *index, output); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::CPUGatherNd(dev_ctx, *x, *index, output); - } - } -}; - -template -class GatherNdGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto *index = ctx.Input("Index"); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dO = ctx.Input(framework::GradVarName("Out")); - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto &place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - auto index_type = index->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - - auto &dev_ctx = ctx.template device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::ScatterNdAdd(dev_ctx, *dO, *index, dX); - } else if (index_type == phi::DataType::INT64) { - phi::funcs::ScatterNdAdd(dev_ctx, *dO, *index, dX); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc b/paddle/fluid/operators/gather_nd_op_npu.cc index 995ab5d0ddf0fda19a163ec31a00a14985b5dbb9..c916f44b874a08a13fb967aae1f8b6a136023b31 100644 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ b/paddle/fluid/operators/gather_nd_op_npu.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gather_nd_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/gather_nd_op_xpu.cc b/paddle/fluid/operators/gather_nd_op_xpu.cc index 9f4c522bd145bedd09fd746781cef5efec15c139..d4cb799e825b640a2a4e0a464e18d63c5e5ed516 100644 --- a/paddle/fluid/operators/gather_nd_op_xpu.cc +++ b/paddle/fluid/operators/gather_nd_op_xpu.cc @@ -11,7 +11,10 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/gather_nd_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { @@ -20,9 +23,9 @@ template class GatherNdXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto *x = ctx.Input("X"); - auto *index = ctx.Input("Index"); - auto *out = ctx.Output("Out"); + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *out = ctx.Output("Out"); out->template mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 8f1d9284c503813ef3dd9688891048a5bca57b29..e0db2f26d3e0534f924cc709b98689fb3f1a5cc6 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -45,6 +45,8 @@ class GatherOpCUDAKernel : public framework::OpKernel { axis = static_cast(cpu_axis.data()[0]); } else if (axis_type == framework::proto::VarType::INT64) { axis = 
static_cast(cpu_axis.data()[0]); + } else if (axis_type == framework::proto::VarType::INT16) { + axis = static_cast(cpu_axis.data()[0]); } } const auto &place = ctx.GetPlace(); @@ -57,6 +59,9 @@ class GatherOpCUDAKernel : public framework::OpKernel { } else if (index_type == framework::proto::VarType::INT64) { phi::funcs::GatherV2CUDAFunction(x, index, axis, output, dev_ctx); + } else if (index_type == framework::proto::VarType::INT16) { + phi::funcs::GatherV2CUDAFunction(x, index, axis, output, + dev_ctx); } return; } @@ -67,6 +72,8 @@ class GatherOpCUDAKernel : public framework::OpKernel { phi::funcs::GPUGather(dev_ctx, *x, *index, output); } else if (index_type == framework::proto::VarType::INT64) { phi::funcs::GPUGather(dev_ctx, *x, *index, output); + } else if (index_type == framework::proto::VarType::INT16) { + phi::funcs::GPUGather(dev_ctx, *x, *index, output); } } }; @@ -134,6 +141,7 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel); REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index a83abb245224baf837296aa6be8f6ceb96ac700c..21093f585b59eea24a231b4dcdf264dc16178fbd 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -17,7 +17,6 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/kron_op.h" #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" diff --git a/paddle/fluid/operators/gather_tree_op.cc b/paddle/fluid/operators/gather_tree_op.cc index 2868c3697eda19ed3e7cc1fb4c74e9beeaca9c0d..c84e94f5c71277c4fe8f25b73b266169f0d0877a 100644 --- a/paddle/fluid/operators/gather_tree_op.cc +++ b/paddle/fluid/operators/gather_tree_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,20 +24,6 @@ class GatherTreeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "GatherTree"); - OP_INOUT_CHECK(ctx->HasInput("Parents"), "Input", "Parents", "GatherTree"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GatherTree"); - - auto ids_dims = ctx->GetInputDim("Ids"); - auto parents_dims = ctx->GetInputDim("Parents"); - PADDLE_ENFORCE_EQ(ids_dims == parents_dims, true, - platform::errors::InvalidArgument( - "The shape of Input(Parents) must be same with the " - "shape of Input(Ids).")); - ctx->SetOutputDim("Out", ids_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -72,4 +61,8 @@ selected ids. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker); +DECLARE_INFER_SHAPE_FUNCTOR(gather_tree, GatherTreeInferShapeFunctor, + PD_INFER_META(phi::GatherTreeMeta)); + +REGISTER_OPERATOR(gather_tree, ops::GatherTreeOp, ops::GatherTreeOpMaker, + GatherTreeInferShapeFunctor); diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 774ff0bd065995916562061784f5218336a9da93..66eecc13d04d1aa7d4532b69f7a2fbe8c62b8e6f 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -15,38 +15,19 @@ limitations under the License. */ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/fill_constant_op.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -class CPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - float mean = context.Attr("mean"); - float std = context.Attr("std"); - auto* tensor = context.Output("Out"); - - std::normal_distribution dist(mean, std); - auto shape = GetShape(context); - tensor->Resize(shape); - int64_t size = tensor->numel(); - T* data = tensor->mutable_data(context.GetPlace()); - unsigned int seed = static_cast(context.Attr("seed")); - auto engine = framework::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } - } -}; // namespace operators template class CPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { @@ -75,38 +56,6 @@ class GaussianRandomOp : public 
framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GaussianRandom"); - - auto shape = ctx->Attrs().Get>("shape"); - std::vector temp; - temp.reserve(shape.size()); - for (auto dim : shape) { - temp.push_back(static_cast(dim)); - } - if (shape.empty() && ctx->HasInput("ShapeTensor")) { - auto shape_dims = ctx->GetInputDim("ShapeTensor"); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - ctx->SetOutputDim("Out", phi::make_ddim(vec_dims)); - - return; - } - if (!ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList")) { - PADDLE_ENFORCE_GT( - shape.size(), 0UL, - platform::errors::InvalidArgument( - "Attribute(shape) of GaussianRandomOp must be set " - "and shape.size() > 0, but reveived shape.size() is %d", - shape.size())); - } - - ctx->SetOutputDim("Out", phi::make_ddim(temp)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -192,13 +141,20 @@ Used to initialize tensors with gaussian random generator. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, - ops::GaussianRandomOpMaker); -REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel, - ops::CPUGaussianRandomKernel); + +DECLARE_INFER_SHAPE_FUNCTOR(gaussian_random, GaussianRandomInferShapeFunctor, + PD_INFER_META(phi::GaussianRandomInferMeta)); + +REGISTER_OPERATOR( + gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + GaussianRandomInferShapeFunctor); + REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, ops::CPUGaussianRandomBatchSizeLikeKernel, ops::CPUGaussianRandomBatchSizeLikeKernel); + REGISTER_OP_VERSION(gaussian_random) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 21d827c79200c4a368ce7677b01b18ee4ddedb8d..00ce10bfe3bccb404bce9f681ee3c7030e0fa4c4 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -19,9 +19,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" -#include "paddle/fluid/operators/index_impl.cu.h" + +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" DECLARE_bool(use_curand); @@ -44,7 +45,8 @@ struct GaussianGenerator { thrust::minstd_rand rng; rng.seed(seed_); using MT = typename details::MPTypeTrait::Type; - thrust::normal_distribution dist(mean_, std_); + thrust::normal_distribution dist(static_cast(mean_), + static_cast(std_)); unsigned int new_n = n + offset_; rng.discard(new_n); MT out = dist(rng); @@ -52,53 +54,6 @@ struct GaussianGenerator { } }; -template -class GPUGaussianRandomKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* tensor = context.Output("Out"); - unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - T mean = static_cast(context.Attr("mean")); - T std = static_cast(context.Attr("std")); - auto shape = GetShape(context); - tensor->Resize(shape); - - auto& dev_cxt = - context.template device_context(); - T* data = tensor->mutable_data(dev_cxt.GetPlace()); - - int64_t size = tensor->numel(); - - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - - if (gen_cuda->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename details::MPTypeTrait::Type; - distribution::normal_distribution dist; - distribution::normal_transform trans(mean, std); - distribution::distribution_and_transform(dev_cxt, tensor, dist, - trans); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = 
size * seed_offset.second; - auto func = - GaussianGenerator(mean, std, seed_offset.first, gen_offset); - IndexKernel>(dev_cxt, tensor, func); - } - } else { - auto func = GaussianGenerator(mean, std, seed); - IndexKernel>(dev_cxt, tensor, func); - } - } -}; - template class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { public: @@ -126,21 +81,16 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { int64_t gen_offset = size * seed_offset.second; auto func = GaussianGenerator(mean, std, seed_offset.first, seed_offset.second); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } else { auto func = GaussianGenerator(mean, std, seed); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } } }; } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL( - gaussian_random, - paddle::operators::GPUGaussianRandomKernel, - paddle::operators::GPUGaussianRandomKernel, - paddle::operators::GPUGaussianRandomKernel); REGISTER_OP_CUDA_KERNEL( gaussian_random_batch_size_like, paddle::operators::GPUGaussianRandomBatchSizeLikeKernel< diff --git a/paddle/fluid/operators/gelu_op.cu b/paddle/fluid/operators/gelu_op.cu index 6b778eee4345170a0288bc5741c6c1078615022f..ef836ab72f001a540e081d7e9975ca5ee28758be 100644 --- a/paddle/fluid/operators/gelu_op.cu +++ b/paddle/fluid/operators/gelu_op.cu @@ -58,7 +58,7 @@ static __global__ void FP16FastGeluFwdCUDAKernel(const __half* x, __half* y, static_cast(threadIdx.x + blockIdx.x * blockDim.x) * VecSize; size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; for (; offset < n; offset += stride) { - using ArrT = platform::AlignedVector<__half, VecSize>; + using ArrT = phi::AlignedVector<__half, VecSize>; ArrT in_arr = *reinterpret_cast(x + offset); #pragma unroll for (int i = 0; i < VecSize; ++i) { @@ -77,7 +77,7 @@ static __global__ void FP16FastGeluBwdCUDAKernel(const __half* x, static_cast(threadIdx.x + blockIdx.x * 
blockDim.x) * VecSize; size_t stride = static_cast(blockDim.x * gridDim.x) * VecSize; for (; offset < n; offset += stride) { - using ArrT = platform::AlignedVector<__half, VecSize>; + using ArrT = phi::AlignedVector<__half, VecSize>; ArrT x_in_arr = *reinterpret_cast(x + offset); ArrT y_g_in_arr = *reinterpret_cast(y_g + offset); #pragma unroll @@ -103,7 +103,7 @@ static bool TryLaunchFP16FastGeluFwdVectorizeCUDAKernel( #define PD_LAUNCH_FP16_FAST_GELU_FWD_KERNEL(__vec_size, __use_fast_math) \ do { \ constexpr auto kAlignment = \ - alignof(platform::AlignedVector<__half, __vec_size>); \ + alignof(phi::AlignedVector<__half, __vec_size>); \ if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ is_aligned(y, kAlignment)) { \ size_t thread = std::min(512, dev_ctx.GetMaxThreadsPerBlock()); \ @@ -138,7 +138,7 @@ static bool TryLaunchFP16FastGeluBwdVectorizeCUDAKernel( #define PD_LAUNCH_FP16_FAST_GELU_BWD_KERNEL(__vec_size, __use_fast_math) \ do { \ constexpr auto kAlignment = \ - alignof(platform::AlignedVector<__half, __vec_size>); \ + alignof(phi::AlignedVector<__half, __vec_size>); \ if (n % __vec_size == 0 && is_aligned(x, kAlignment) && \ is_aligned(x, kAlignment) && is_aligned(y_g, kAlignment) && \ is_aligned(x_g, kAlignment)) { \ diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index 00ff7ad2166dcf99d7b60ec45adfe70b478dedf8..f3ac53138328dbfad12c6d530a6517f40c658677 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index 6af8388d9eba4e4ea8fbb833f84a5c06e182b1f2..f7c006dbcb1a9a23ec619c8d790df8a093530eee 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/graph_send_recv_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -21,59 +24,6 @@ class GraphSendRecvOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasInput("Src_index"), "Input", "Src_index", - "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasInput("Dst_index"), "Input", "Dst_index", - "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GraphSendRecv"); - - auto src_index_dims = ctx->GetInputDim("Src_index"); - if (src_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(src_index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of Src_index should be 1 when it " - "is 2D, but we get %d", - src_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - src_index_dims.size(), 1, - platform::errors::InvalidArgument( - "The Src_index should 
be 1D, when it is not 2D, but we get %d", - src_index_dims.size())); - } - - auto dst_index_dims = ctx->GetInputDim("Dst_index"); - if (dst_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(dst_index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of Dst_index should be 1 when it " - "is 2D, but we get %d", - dst_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dst_index_dims.size(), 1, - platform::errors::InvalidArgument("The Dst_index should be 1D, " - "when it is not 2D, but we get %d", - dst_index_dims.size())); - } - - PADDLE_ENFORCE_EQ( - src_index_dims[0], dst_index_dims[0], - platform::errors::InvalidArgument( - "Src_index and Dst_index should have the same shape.")); - - auto dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pool_type") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("Dst_count"), "Output", "Dst_count", - "GraphSendRecv"); - ctx->SetOutputDim("Dst_count", {dims[0]}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,20 +114,12 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv, GraphSendRecvInferShapeFunctor, + PD_INFER_META(phi::GraphSendRecvInferMeta)); REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP, ops::GraphSendRecvOpMaker, ops::GraphSendRecvGradOpMaker, - ops::GraphSendRecvGradOpMaker); + ops::GraphSendRecvGradOpMaker, + GraphSendRecvInferShapeFunctor); REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp); -REGISTER_OP_CPU_KERNEL(graph_send_recv, ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel); - -REGISTER_OP_CPU_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - 
ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu deleted file mode 100644 index f43d31814ac38430d2d473eeca548b63e1a5c1fa..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ /dev/null @@ -1,419 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/graph_send_recv_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMaxCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMinCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - 
paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); - } -}; - -template -__global__ void GraphSendRecvCUDAKernel(const T* params, - const IndexT* src_indices, - const IndexT* dst_indices, T* output, - size_t index_size, size_t slice_size, - Functor functor) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - functor(params, output, in_i, out_i); - } -} - -// For max -template -__global__ void InputResetMaxCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::min()) { - *(output + i) = 0; - } - } -} - -// For min -template -__global__ void InputResetMinCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::max()) { - *(output + i) = 0; - } - } -} - -// Get dst_count -template -__global__ void ComputeCountCUDAKernel(int* count, const IndexT* dst_indices, - size_t index_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { - IndexT dst_i = dst_indices[i]; - paddle::platform::CudaAtomicAdd(count + dst_i, 1); - } -} - -// For forward mean -template -__global__ void ManipulateMeanCUDAKernel(T* output, int* count, - size_t input_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - int64_t c_index = i / slice_size; - if (*(count + c_index) > 1) { - *(output + i) = *(output + i) / *(count + c_index); - } - } -} - -// For backward mean -template -__global__ void ManipulateMeanGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, 
const int* dst_count) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd(output + out_i, - *(params + in_i) / dst_count[src_i]); - } -} - -// For backward min and max -template -__global__ void ManipulateMinMaxGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const T* ptr_input, - const T* ptr_output) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd( - output + out_i, - *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); - } -} - -template -void GraphSendRecvOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input("X"); - auto* Y = ctx.Output("Out"); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - 
thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); - } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::max()); - } - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MAX") { - GraphSendRecvMaxCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_max = - grid_max_tmp < max_grid_dimx ? 
grid_max_tmp : max_grid_dimx; - InputResetMaxCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MIN") { - GraphSendRecvMinCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_min = - grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx; - InputResetMinCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MEAN") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_dst_count, 0, input_size * sizeof(int)); -#else - cudaMemset(p_dst_count, 0, input_size * sizeof(int)); -#endif - - int64_t grid_count = (index_size + block - 1) / block; - ComputeCountCUDAKernel< - T, IndexT><<( - ctx.device_context()) - .stream()>>>(p_dst_count, d_index, index_size); - - int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_mean = - grid_mean_tmp < max_grid_dimx ? 
grid_mean_tmp : max_grid_dimx; - ManipulateMeanCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, p_dst_count, input_size, slice_size); - } -} - -template -void GraphSendRecvGradOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* Y = ctx.Output(framework::GradVarName("X")); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - ManipulateMeanGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, s_count); - } else if (pool_type == "MAX" || pool_type == "MIN") { - auto* input = ctx.Input("X"); - auto* output = ctx.Input("Out"); - const T* ptr_input = input->data(); - const T* ptr_output = output->data(); - ManipulateMinMaxGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, ptr_input, - ptr_output); - } -} - -template -class GraphSendRecvOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto* dst_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto* dst_index = 
ctx.Input("Src_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle - -using CUDA = paddle::platform::CUDADeviceContext; -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(graph_send_recv, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.h b/paddle/fluid/operators/graph_send_recv_op.h deleted file mode 100644 index 8d8111e0ee845bf6828ee53459e6d86bdebba484..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.h +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - eigen_dst += eigen_src; - } -}; - -template -struct GraphSendRecvMinFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMin(eigen_src); - } - } -}; - -template -struct GraphSendRecvMaxFunctor { - void operator()(const int& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMax(eigen_src); - } - } -}; - -template -void elementwise_inner_operation(const Tensor& src, Tensor* dst, - const IndexT& src_index, - const IndexT& dst_index, - const bool& first_flag, Functor functor) { - auto src_slice = src.Slice(src_index, src_index + 1); - auto dst_slice = dst->Slice(dst_index, dst_index + 1); - - functor(first_flag, src_slice, &dst_slice); -} - -template -void graph_send_recv_cpu_for_loop(const int& input_size, const int& index_size, - const IndexT* s_index, const IndexT* d_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - int* dst_count = nullptr) { - Functor functor; - if (pool_type == "SUM") { - for (int i = 0; i < index_size; ++i) { - 
const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - for (int i = 0; i < index_size; ++i) { - IndexT dst_idx = d_index[i]; - *(dst_count + dst_idx) += 1; - } - for (int i = 0; i < input_size; ++i) { - if (*(dst_count + i) == 0) continue; - auto dst_slice = dst->Slice(i, i + 1); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(*(dst_count + i)); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - std::set existed_dst; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); - if (!in_set) { - elementwise_inner_operation(src, dst, src_idx, - dst_idx, true, functor); - existed_dst.emplace(dst_idx); - } else { - elementwise_inner_operation( - src, dst, src_idx, dst_idx, false, functor); - } - } - } -} - -template -void graph_send_recv_cpu_for_loop_grad( - const int& input_size, const int& index_size, const IndexT* s_index, - const IndexT* d_index, const Tensor& src, Tensor* dst, - const std::string& pool_type, const int* dst_count = nullptr, - const Tensor* input = nullptr, const Tensor* output = nullptr) { - if (pool_type == "SUM") { - Functor functor; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - auto src_slice = src.Slice(src_idx, src_idx + 1); - 
auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += (eigen_src / static_cast(dst_count[src_idx])); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - for (int i = 0; i < index_size; ++i) { - const IndexT& forward_src_idx = d_index[i]; - const IndexT& forward_dst_idx = s_index[i]; - auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); - auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); - auto eigen_input = framework::EigenVector::Flatten(input_slice); - auto eigen_output = framework::EigenVector::Flatten(output_slice); - - auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1); - auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += eigen_src * (eigen_output == eigen_input); - } - } -} - -template -void GraphSendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { - auto* X = ctx.Input("X"); - auto* dst_index = ctx.Input("Dst_index"); - auto* Y = ctx.Output("Out"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if (index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MIN") { - graph_send_recv_cpu_for_loop>( - src_dims[0], 
index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MAX") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - memset(p_dst_count, 0, src_dims[0] * sizeof(int)); - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, - p_dst_count); - } -} - -template -void GraphSendRecvGradOpKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* dst_index = ctx.Input("Src_index"); - auto* Y = ctx.Output(framework::GradVarName("X")); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if (index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); - } else if (pool_type == "MIN" || pool_type == "MAX") { - const auto* input = ctx.Input("X"); - const auto* output = ctx.Input("Out"); - // Functor not used here. 
- graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, - input, output); - } -} - -template -class GraphSendRecvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpKernelLaunchHelper(ctx, *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpKernelLaunchHelper(ctx, - *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpKernelLaunchHelper(ctx, - *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpKernelLaunchHelper( - ctx, *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 2d284fb516e62b08fb48ab96d2478675c495c6f6..4331523d26edc1012ff67e4a08f69d682753bb7a 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -167,9 +167,11 @@ class GroupNormGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { // check input 
+ OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput("Variance"), "Input", "Variance", "GroupNormGrad"); + OP_INOUT_CHECK(ctx->HasInput("Mean"), "Input", "Mean", "GroupNormGrad"); OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", framework::GradVarName("Y"), "GroupNormGrad"); @@ -216,10 +218,12 @@ class GroupNormGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr op) const override { op->SetType("group_norm_grad"); + op->SetInput("X", this->Input("X")); op->SetInput("Scale", this->Input("Scale")); op->SetInput("Bias", this->Input("Bias")); op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); op->SetInput("Y", this->Output("Y")); + op->SetInput("Mean", this->Output("Mean")); op->SetInput("Variance", this->Output("Variance")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index b376334f1e93cc3be9e716d808525011edb29b94..ab8c50d90b8ece68b8e4e05d46cecd13fa84d7e0 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -81,46 +81,74 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); } -template -__device__ __forceinline__ void ThreadReduce(const T* input, int size, - const int offset, AccT* mean, - AccT* var) { +template +__device__ __forceinline__ void ThreadReduce(phi::Array arrs, + int size, const int offset, + AccT* out_mean, AccT* out_var) { + const T* x = arrs[0]; + const T* y; + if (Num == 2) { + y = arrs[1]; + } using VecT = kps::details::VectorType; int tid = threadIdx.x; if (offset > 0) { - input -= offset; + x -= offset; + if (Num == 2) { + y -= offset; + } size += offset; if (tid >= offset) { - AccT temp = input[tid]; - *mean += temp; - *var += temp * temp; 
+ if (Num == 1) { + *out_mean += x[tid]; + *out_var += x[tid] * x[tid]; + } else if (Num == 2) { + *out_mean += y[tid]; + *out_var += y[tid] * x[tid]; + } } size -= blockDim.x; - input += blockDim.x; + x += blockDim.x; + if (Num == 2) { + y += blockDim.x; + } } int remain = size % (VecSize * blockDim.x); - T ins[VecSize]; - VecT* ins_vec = reinterpret_cast(&ins); + T ins_x[VecSize]; + T ins_y[VecSize]; + VecT* ins_vec_x = reinterpret_cast(&ins_x); + VecT* ins_vec_y = reinterpret_cast(&ins_y); // vector part for (; VecSize * tid < (size - remain); tid += blockDim.x) { - *ins_vec = reinterpret_cast(input)[tid]; + *ins_vec_x = reinterpret_cast(x)[tid]; + if (Num == 2) { + *ins_vec_y = reinterpret_cast(y)[tid]; + } #pragma unroll for (int i = 0; i < VecSize; ++i) { - AccT temp = ins[i]; - *mean += temp; - *var += temp * temp; + if (Num == 1) { + *out_mean += ins_x[i]; + *out_var += ins_x[i] * ins_x[i]; + } else if (Num == 2) { + *out_mean += ins_y[i]; + *out_var += ins_y[i] * ins_x[i]; + } } } // scalar part tid = size - remain + threadIdx.x; for (; tid < size; tid += blockDim.x) { - AccT temp = input[tid]; - *mean += temp; - *var += temp * temp; + if (Num == 1) { + *out_mean += x[tid]; + *out_var += x[tid] * x[tid]; + } else if (Num == 2) { + *out_mean += y[tid]; + *out_var += y[tid] * x[tid]; + } } } @@ -148,7 +176,10 @@ __global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var, AccT x_var = static_cast(0); const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); x += i * size; - ThreadReduce(x, size, input_offset, &x_mean, &x_var); + phi::Array ins; + ins[0] = x; + ThreadReduce(ins, size, input_offset, &x_mean, &x_var); + x_mean = kps::details::BlockXReduce>( x_mean, kps::AddFunctor()); x_var = kps::details::BlockXReduce>( @@ -310,10 +341,12 @@ class GroupNormKernel }; template -__global__ void GroupNormBackwardGetMeanAndVar( - const T* x, const T* scale, const T* bias, const T* d_y, int N, int C, - int W, int imsize, int groups, int 
group_size, T epsilon, T* d_mean, - T* d_var, T* d_scale, T* d_bias, const DataLayout data_layout) { +__global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale, + const T* bias, const T* d_y, + int N, int C, int W, int imsize, + int groups, int group_size, + T epsilon, T* d_mean, T* d_var, + T* d_scale, T* d_bias) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -329,15 +362,11 @@ __global__ void GroupNormBackwardGetMeanAndVar( for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val, dval; - if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid] - x_bias; - dval = d_y[(bid * C + ccid) * imsize + imid]; - } else { - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; - dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - } + + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; + dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; d_var_data += val * dval; d_mean_data += dval * x_scale; @@ -357,8 +386,7 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, const T* bias, const T* var, const T* d_mean, const T* d_var, int N, int C, int W, int imsize, int groups, int group_size, - T epsilon, T* d_x, - const DataLayout data_layout) { + T epsilon, T* d_x) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -379,26 +407,142 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, if (x_scale != 0) x_scale_inv = 1.0 / x_scale; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - if (data_layout == DataLayout::kNCHW) { - T tmp = x[(bid * C + ccid) * imsize + imid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * C + ccid) * imsize + imid]; - d_x[(bid * C + ccid) * imsize + imid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); - } else { - int hid = imid 
/ W; - int wid = imid % W; - T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; - d_x[(bid * H + hid) * W * C + wid * C + ccid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + int hid = imid / W; + int wid = imid % W; + T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; + T v_y = (tmp - x_bias) * x_scale_inv; + T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; + d_x[(bid * H + hid) * W * C + wid * C + ccid] = + x_var_inv * + (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + } +} + +template +__global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, + T* ds, T* db) { + int i = blockIdx.x; + AccT ds_sum = static_cast(0); + AccT db_sum = static_cast(0); + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); + x += i * imsize; + + phi::Array ins; + ins[0] = x; + ins[1] = dy; + ThreadReduce(ins, imsize, input_offset, &db_sum, + &ds_sum); + + ds_sum = kps::details::BlockXReduce>( + ds_sum, kps::AddFunctor()); + db_sum = kps::details::BlockXReduce>( + db_sum, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + ds[i] = ds_sum; + db[i] = db_sum; + } +} + +template +__global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, + T* ds, T* db) { + const int nc = blockIdx.x; + T ds_sum = 0; + T db_sum = 0; + for (int i = threadIdx.x; i < imsize; i += blockDim.x) { + const int index = nc * imsize + i; + ds_sum += dy[index] * x[index]; + db_sum += dy[index]; + } + CudaAtomicAddWithWarp(&ds[nc], ds_sum); + CudaAtomicAddWithWarp(&db[nc], db_sum); +} + +template +__global__ void GetScaleBiasGradientCUDAKernel(int N, int C, int group, + T epsilon, const T* mean, + const T* var, const T* ds, + const T* db, T* d_scale, + T* d_bias) { + const int c = blockIdx.x * blockDim.x + threadIdx.x; + if (c < C) { + const int G = group; + const int D = C / G; + T 
sum1 = 0; + T sum2 = 0; + for (int n = 0; n < N; ++n) { + const int nc = n * C + c; + const int ng = n * G + c / D; + sum1 += (d_scale == nullptr) + ? T(0) + : ((ds[nc] - db[nc] * static_cast(mean[ng])) * + static_cast(rsqrt(var[ng] + epsilon))); + sum2 += (d_bias == nullptr) ? T(0) : db[nc]; + } + if (d_scale != nullptr) { + d_scale[c] = sum1; + } + if (d_bias != nullptr) { + d_bias[c] = sum2; } } } +template +__global__ void GetBackwardParamsCUDAKernel(int imsize, int groups, + int group_size, T epsilon, + const T* mean, const T* var, + const T* scale, const T* ds, + const T* db, T* p1, T* p2, T* p3) { + const int n = blockIdx.x; + const int g = blockIdx.y; + const int ng = n * groups + g; + T sum1 = 0; + T sum2 = 0; + T var_inv = rsqrt(var[ng] + epsilon); + for (int64_t i = threadIdx.x; i < group_size; i += blockDim.x) { + const int64_t index = ng * group_size + i; + const int64_t c = g * group_size + i; + const T scale_v = scale == nullptr ? T(1) : static_cast(scale[c]); + sum1 += ds[index] * scale_v; + sum2 += db[index] * scale_v; + const T scale_c = scale == nullptr ? 
T(0) : static_cast(scale[c]); + p1[index] = scale_c * var_inv; + } + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + sum1 = BlockReduce(ds_storage).Reduce(sum1, cub::Sum()); + sum2 = BlockReduce(db_storage).Reduce(sum2, cub::Sum()); + + if (threadIdx.x == 0) { + const T s = T(1) / static_cast(group_size * imsize); + const T x = (sum2 * static_cast(mean[ng]) - sum1) * + static_cast(var_inv) * static_cast(var_inv) * + static_cast(var_inv) * s; + p2[ng] = x; + p3[ng] = -x * static_cast(mean[ng]) - sum2 * static_cast(var_inv) * s; + } +} + +template +__global__ void GetXGradientCUDAKernel(int imsize, int C, int group_size, + int groups, T* p1, T* p2, T* p3, + const T* x, const T* dy, T* dx) { + int cid = blockIdx.x; + int gid = blockIdx.y; + int bid = blockIdx.z; + int ccid = bid * C + gid * group_size + cid; + int ng = bid * groups + gid; + int nc = gid * group_size + cid; + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + int index = (bid * C + nc) * imsize + imid; + dx[index] = p1[ccid] * dy[index] + p2[ng] * x[index] + p3[ng]; + } +} + template class GroupNormGradKernel : public framework::OpKernel { @@ -408,7 +552,9 @@ class GroupNormGradKernel const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); - auto* x = ctx.Input("Y"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* mean = ctx.Input("Mean"); auto* var = ctx.Input("Variance"); auto* scale = ctx.Input("Scale"); auto* bias = ctx.Input("Bias"); @@ -433,31 +579,27 @@ class GroupNormGradKernel phi::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); - Tensor temp_var; - temp_var.mutable_data(var->dims(), ctx.GetPlace()); - set_zero(dev_ctx, &temp_var, static_cast(0)); - T* temp_var_data = temp_var.data(); - - Tensor temp_mean; - temp_mean.mutable_data(var->dims(), 
ctx.GetPlace()); - set_zero(dev_ctx, &temp_mean, static_cast(0)); - T* temp_mean_data = temp_mean.data(); + Tensor ds, db; + ds.mutable_data({x_dims[0], C}, ctx.GetPlace()); + db.mutable_data({x_dims[0], C}, ctx.GetPlace()); + T* ds_data = ds.data(); + T* db_data = db.data(); + auto* y_data = y->data(); auto* x_data = x->data(); T* d_x_data = nullptr; if (d_x) d_x_data = d_x->data(); - auto* y_data = d_y->data(); + auto* dy_data = d_y->data(); auto* var_data = var->data(); + auto* mean_data = mean->data(); T* d_scale_data = nullptr; if (d_scale) { d_scale->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_scale, static_cast(0)); d_scale_data = d_scale->data(); } T* d_bias_data = nullptr; if (d_bias) { d_bias->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_bias, static_cast(0)); d_bias_data = d_bias->data(); } @@ -479,22 +621,103 @@ class GroupNormGradKernel #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); + const int block_dims = 256; #else int block_size = std::min(1024, imsize); + const int block_dims = 1024; #endif dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; - UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, x_data, scale_data, - bias_data, y_data, x_dims[0], C, W, imsize, groups, - group_size, epsilon, temp_mean_data, temp_var_data, - d_scale_data, d_bias_data, data_layout); - if (d_x_data != nullptr) { - UNROLL_ALL_CASES(flags, GroupNormBackward, x_data, y_data, scale_data, - bias_data, var_data, temp_mean_data, temp_var_data, - x_dims[0], C, W, imsize, groups, group_size, epsilon, - d_x_data, data_layout); + if (data_layout == DataLayout::kNCHW) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + const int max_num_threads = 1024; + int max_block_size = std::min(imsize / vec_size, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < 
max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 blocks(block_size_nchw); + if (imsize < vec_size) { + if (d_scale) { + set_zero(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + set_zero(dev_ctx, d_bias, static_cast(0)); + } + ScalarGetDsDbCUDAKernel< + T><<>>( + imsize, x_data, dy_data, ds_data, db_data); + } else { + VectorizedGetDsDbCUDAKernel< + T, AccT, vec_size><<>>( + imsize, x_data, dy_data, ds_data, db_data); + } + + if (d_scale || d_bias) { + const int block = 256; + GetScaleBiasGradientCUDAKernel< + T><<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>( + x_dims[0], C, groups, epsilon, mean_data, var_data, ds_data, + db_data, d_scale_data, d_bias_data); + } + + if (d_x_data != nullptr) { + // p1 * dy + p2 * x + p3, + // p1, p2, p3 represent the reverse calculation of temporary variables + // p1 = scale * var_inv + // p2 = (db * scale * mean - ds * scale) * pow(var_inv, 3) * (1/n) + // p3 = -p2 * mean[ng] - db * scale * var_inv * (1/n); + Tensor p1, p2, p3; + p1.mutable_data({x_dims[0] * C}, ctx.GetPlace()); + p2.mutable_data({x_dims[0], groups}, ctx.GetPlace()); + p3.mutable_data({x_dims[0], groups}, ctx.GetPlace()); + T* p1_data = p1.data(); + T* p2_data = p2.data(); + T* p3_data = p3.data(); + + GetBackwardParamsCUDAKernel<<< + dim3(x_dims[0], groups), block_dims, 0, dev_ctx.stream()>>>( + imsize, groups, group_size, epsilon, mean_data, var_data, + scale_data, ds_data, db_data, p1_data, p2_data, p3_data); + GetXGradientCUDAKernel<<>>( + imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data, + dy_data, d_x_data); + } + + } else { + if (d_scale) { + set_zero(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + set_zero(dev_ctx, d_bias, static_cast(0)); + } + + Tensor temp_var; + temp_var.mutable_data(var->dims(), ctx.GetPlace()); + set_zero(dev_ctx, &temp_var, static_cast(0)); + T* temp_var_data = temp_var.data(); + + Tensor temp_mean; + 
temp_mean.mutable_data(var->dims(), ctx.GetPlace()); + set_zero(dev_ctx, &temp_mean, static_cast(0)); + T* temp_mean_data = temp_mean.data(); + + int flags = (scale_data != nullptr) * kHasScale + + (bias_data != nullptr) * kHasBias; + UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, y_data, + scale_data, bias_data, dy_data, x_dims[0], C, W, imsize, + groups, group_size, epsilon, temp_mean_data, + temp_var_data, d_scale_data, d_bias_data); + if (d_x_data != nullptr) { + UNROLL_ALL_CASES(flags, GroupNormBackward, y_data, dy_data, scale_data, + bias_data, var_data, temp_mean_data, temp_var_data, + x_dims[0], C, W, imsize, groups, group_size, epsilon, + d_x_data); + } } } }; diff --git a/paddle/fluid/operators/gumbel_softmax_op.cc b/paddle/fluid/operators/gumbel_softmax_op.cc index f8f8f3fd789ad61a99bcc17bc073b6cfd099f639..524f2d6c9d719468876d8a586b6eea13f99a7b79 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cc +++ b/paddle/fluid/operators/gumbel_softmax_op.cc @@ -90,11 +90,11 @@ class GumbelSoftmaxGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, - PT_INFER_META(phi::GumbelSoftmaxInferMeta)); -DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, +DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, + PD_INFER_META(phi::GumbelSoftmaxInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, GumbelSoftmaxGradInferShapeFunctor, - PT_INFER_META(phi::GumbelSoftmaxGradInferMeta)); + PD_INFER_META(phi::GumbelSoftmaxGradInferMeta)); REGISTER_OPERATOR(gumbel_softmax, ops::GumbelSoftmaxOp, ops::GumbelSoftmaxOpMaker, diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 3915ce5809c394738c58e80accccac531c268c23..3c9bbc753f29b1cf104a085d340ddc75cf2790f8 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -112,8 +112,8 @@ class 
HuberLossGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, - PT_INFER_META(phi::HuberLossInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, + PD_INFER_META(phi::HuberLossInferMeta)); REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, ops::HuberLossGradOpMaker, diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 567a69f383d1cc20cc1fc5b8c0a5c6f8368824af..16968876ac96cac2fa1b009ea40b939f1e11a953 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(ImagGradOpInplaceInferer, } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, - PT_INFER_META(phi::RealAndImagInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(imag, ImagInferShapeFunctor, + PD_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index 105d818e197434c4ed85126228e06d45bf06e498..e2efaa1759b008dd0055bb6e06917cbd4fc1932f 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -87,8 +87,8 @@ class IncrementGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, - PT_INFER_META(phi::IncrementInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, + PD_INFER_META(phi::IncrementInferMeta)); REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker, ops::IncrementGradOpMaker, ops::IncrementGradOpMaker, diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index 
09f4e63943ad3784a598524273831bf875ed9213..8324a6215bca8145ba36dabb3d8108006a57e829 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h index 2e3e6569ef5a88f8dfcb6646974b70bcc6c0c95f..bb26e2f445e7034b8f982594216eacfd3007a24f 100644 --- a/paddle/fluid/operators/index_impl.cu.h +++ b/paddle/fluid/operators/index_impl.cu.h @@ -19,11 +19,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" namespace paddle { @@ -58,7 +58,7 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { int numel = out->numel(); T *out_data = out->mutable_data(dev_ctx.GetPlace()); if (numel <= 0) return; - int vec_size = paddle::platform::GetVectorizedSize(out_data); + int vec_size = phi::GetVectorizedSize(out_data); #ifdef PADDLE_WITH_XPU_KP int block = 64; int grid = 8; diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index 
68d002fceea70fd032d7613802d095770d3d4754..d17c6368c7537b93ceb6f1d75b6d73467bd207ac 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -100,8 +100,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, - PT_INFER_META(phi::IndexSampleInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, + PD_INFER_META(phi::IndexSampleInferMeta)); REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker, ops::IndexSampleGradMaker, ops::IndexSampleGradMaker, diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index e0779249c41adc5005bbaba6e19127d2ced3a9ec..7f5136969980b887bb7bbe013690898e66abeac1 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -17,6 +17,8 @@ #include #include #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" namespace paddle { namespace operators { @@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker { }; template -class InplaceABNKernel - : public paddle::operators::BatchNormKernel { +class InplaceABNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); @@ -213,7 +214,33 @@ class InplaceABNKernel auto activation = GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); - BatchNormKernel::Compute(ctx); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); 
+ auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout, + is_test, use_global_stats, trainable_statistics, fuse_with_relu, y, + mean_out, variance_out, saved_mean, saved_variance, reserve_space); auto cur_y = EigenVector::Flatten(*y); InplaceABNActivation functor; @@ -222,8 +249,7 @@ class InplaceABNKernel }; template -class InplaceABNGradKernel - : public paddle::operators::BatchNormGradKernel { +class InplaceABNGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Input("Y"); @@ -244,7 +270,52 @@ class InplaceABNGradKernel InplaceABNActivation functor; functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); - BatchNormGradKernel::Compute(ctx); + // BatchNormGradKernel::Compute(ctx); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad 
= ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + + paddle::optional space_opt = paddle::none; + paddle::optional mean_opt = paddle::none; + paddle::optional variance_opt = paddle::none; + + if (reserve_space != nullptr) { + space_opt = *reserve_space; + } + + if (mean != nullptr) { + mean_opt = *mean; + } + + if (variance != nullptr) { + variance_opt = *variance; + } + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormGradRawKernel( + static_cast::TYPE&>(dev_ctx), + *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt, + mean_opt, variance_opt, momentum, epsilon, data_layout, is_test, + use_global_stats, trainable_statistics, fuse_with_relu, true, d_x, + scale_grad, bias_grad); } }; diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index be7a7bd71711e379ef4d98eb1f9ac5ee2caaace6..db8f8c72d13f8e46f6f9e332c5c2f5164b6d0836 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -15,14 +15,15 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/inplace_abn_op.h" #include "paddle/fluid/operators/sync_batch_norm_op.cu.h" +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" namespace paddle { namespace operators { template class InplaceABNKernel - : public paddle::operators::SyncBatchNormKernel, - public paddle::operators::BatchNormKernel { + : public paddle::operators::SyncBatchNormKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Output("Y"); @@ -36,7 +37,33 @@ class InplaceABNKernel if (ctx.Attr("use_sync_bn")) { SyncBatchNormKernel::Compute(ctx); } else { - BatchNormKernel::Compute(ctx); + // BatchNormKernel::Compute(ctx); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout, + is_test, use_global_stats, trainable_statistics, fuse_with_relu, y, + mean_out, variance_out, saved_mean, saved_variance, reserve_space); } auto cur_y = EigenVector::Flatten(*y); @@ -49,8 +76,7 @@ class InplaceABNKernel // https://kevinzakka.github.io/2016/09/14/batch_normalization/ template class 
InplaceABNGradKernel - : public paddle::operators::SyncBatchNormGradKernel, - public paddle::operators::BatchNormGradKernel { + : public paddle::operators::SyncBatchNormGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* y = ctx.Input("Y"); @@ -74,7 +100,50 @@ class InplaceABNGradKernel if (ctx.Attr("use_sync_bn")) { SyncBatchNormGradKernel::Compute(ctx); } else { - BatchNormGradKernel::Compute(ctx); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + + paddle::optional space_opt = paddle::none; + paddle::optional mean_opt = paddle::none; + paddle::optional variance_opt = paddle::none; + + if (reserve_space != nullptr) { + space_opt = *reserve_space; + } + + if (mean != nullptr) { + mean_opt = *mean; + } + + if (variance != nullptr) { + variance_opt = *variance; + } + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormGradRawKernel( + static_cast::TYPE&>(dev_ctx), + *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt, + mean_opt, variance_opt, momentum, epsilon, data_layout, is_test, + use_global_stats, trainable_statistics, fuse_with_relu, true, d_x, + scale_grad, bias_grad); } } }; diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu 
index d61eb46d97e98972963f5871a4c6e7b06468337c..cd297c53f89a0f7efc622de7c385b9f75dc7462b 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -61,13 +61,13 @@ inline platform::GpuLaunchConfig GetGpuLaunchConfig3D( template __forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex( - int* in_img_idx, int* w_id, T* w1lambda, T* w2lambda, T src_w, - const int in_img_w) { - src_w = (src_w > 0) ? src_w : 0.f; - *in_img_idx = static_cast(src_w); - *w_id = (*in_img_idx < in_img_w - 1) ? 1 : 0; - *w1lambda = src_w - *in_img_idx; - *w2lambda = 1.f - *w1lambda; + int* in_img_idx, int* x_id, T* lambda1, T* lambda2, T src_x, + const int in_img_x) { + src_x = (src_x > 0) ? src_x : 0.f; + *in_img_idx = static_cast(src_x); + *x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0; + *lambda1 = src_x - *in_img_idx; + *lambda2 = 1.f - *lambda1; } struct FastDivModForInterpolate { @@ -670,83 +670,102 @@ __global__ void KeBilinearInterpBwShareMemory( } } +__device__ __forceinline__ int GetInputIndex(const size_t nc, const int height, + const int width, const int h, + const int w) { + return (nc * height + h) * width + w; +} + +template +__global__ void KeBilinearInterpNCHWBw(T* in, const int in_h, const int in_w, + const int out_h, const int out_w, + const int n, const int num_channels, + float ratio_h, float ratio_w, + const T* __restrict__ out, + const T align_type_value) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + int stride = blockDim.x * gridDim.x; + int num_out = n * num_channels * out_h * out_w; + int num_in = n * num_channels * in_h * in_w; + + for (; index < num_out; index += stride) { + int index_tmp = index; + int w2 = index_tmp % out_w; + index_tmp /= out_w; + int h2 = index_tmp % out_h; + int nc = index_tmp / out_h; + + int h1, y_id; + T h1lambda, h0lambda; + T src_y = ratio_h * (h2 + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex(&h1, &y_id, &h1lambda, &h0lambda, + 
src_y, in_h); + int w1, x_id; + T w1lambda, w0lambda; + T src_x = ratio_w * (w2 + align_type_value) - align_type_value; + PreCalculatorForLinearInterpInputIndex(&w1, &x_id, &w1lambda, &w0lambda, + src_x, in_w); + + T d2val = out[index]; + + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1), + h0lambda * w0lambda * d2val); + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1, w1 + x_id), + h0lambda * w1lambda * d2val); + platform::CudaAtomicAdd(in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1), + h1lambda * w0lambda * d2val); + platform::CudaAtomicAdd( + in + GetInputIndex(nc, in_h, in_w, h1 + y_id, w1 + x_id), + h1lambda * w1lambda * d2val); + } +} + template __global__ void KeBilinearInterpBw(T* in, const int in_h, const int in_w, const T* __restrict__ out, const int out_h, const int out_w, const int n, - const int num_channels, float ratio_h, - float ratio_w, const T align_type_value, - bool is_nchw) { + const int out_chw, const int num_channels, + float ratio_h, float ratio_w, + const T align_type_value, + FastDivModForInterpolate divmods) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; int in_chw = in_h * in_w * num_channels; - int out_chw = num_channels * out_h * out_w; int nthreads = n * out_chw; - if (is_nchw) { - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int channel_id = out_id_w / out_img_size; - int out_img_idy = (out_id_w % out_img_size) / out_w; - int out_img_idx = tid % out_w; - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - 
&w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T* in_pos = &in[out_id_h * in_chw + channel_id * in_img_size + - in_img_idy * in_w + in_img_idx]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w + w_id], - h1lambda * w1lambda * value); - } - } else { - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / out_chw; - int out_id_w = tid % out_chw; - const int in_img_size = in_h * in_w; - const int out_img_size = out_h * out_w; - T value = out[out_id_h * out_chw + out_id_w]; - - int out_img_idy = out_id_w / (out_w * num_channels); - int out_img_idx = out_id_w % (out_w * num_channels) / num_channels; - int channel_id = tid % num_channels; - - int in_img_idx, in_img_idy, w_id, h_id; - T w1lambda, h1lambda, w2lambda, h2lambda; - T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; - T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; - - PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, - &w2lambda, src_w, in_w); - PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, - &h2lambda, src_h, in_h); - - T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + - in_img_idx * num_channels + channel_id]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * value); - platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], - h1lambda * w2lambda * value); - platform::CudaAtomicAdd( - &in_pos[h_id * in_w * num_channels + w_id * num_channels], - h1lambda * w1lambda * value); - } + for (; tid < nthreads; tid += stride) { + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = 
out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; + + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; + + int in_img_idx, in_img_idy, w_id, h_id; + T w1lambda, h1lambda, w2lambda, h2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + + PreCalculatorForLinearInterpInputIndex(&in_img_idx, &w_id, &w1lambda, + &w2lambda, src_w, in_w); + PreCalculatorForLinearInterpInputIndex(&in_img_idy, &h_id, &h1lambda, + &h2lambda, src_h, in_h); + + T value = out[tid]; + T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + + in_img_idx * num_channels + channel_id]; + platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); + platform::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * value); + platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], + h1lambda * w2lambda * value); + platform::CudaAtomicAdd( + &in_pos[h_id * in_w * num_channels + w_id * num_channels], + h1lambda * w1lambda * value); } } @@ -1907,11 +1926,23 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, ctx.cuda_device_context().stream()>>>( input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, ratio_h, ratio_w, align_type_value, is_nchw); + } else if (!optimize_flag & is_nchw) { + // + const int num_kernels = n * c * out_h * out_w; + const int num_threads = + std::min(ctx.cuda_device_context().GetMaxThreadsPerBlock(), 1024); + KeBilinearInterpNCHWBw< + T><<>>( + input_grad_data, in_h, in_w, out_h, out_w, n, c, ratio_h, ratio_w, + output_grad_data, align_type_value); } else { + int64_t cw = c * out_w; + auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); KeBilinearInterpBw<<>>( - input_grad_data, 
in_h, in_w, output_grad_data, out_h, out_w, n, c, - ratio_h, ratio_w, align_type_value, is_nchw); + input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_type_value, interp_divmods); } } else if ("bicubic" == interp_method) { #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/inverse_op.h b/paddle/fluid/operators/inverse_op.h index 1e061d8b50ae02f9b87f0a0976543467aa0b7dd0..31c22915ec5d052eb11c613d476f6aea541d8c47 100644 --- a/paddle/fluid/operators/inverse_op.h +++ b/paddle/fluid/operators/inverse_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" namespace paddle { namespace operators { @@ -30,7 +30,7 @@ class InverseKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *input, output); } }; diff --git a/paddle/fluid/operators/is_empty_op.cc b/paddle/fluid/operators/is_empty_op.cc index 2750367dc773925e998507db4690e39c15f985d0..c835bb3cf60bfbf71b585828c74ac45f6bc91f8b 100644 --- a/paddle/fluid/operators/is_empty_op.cc +++ b/paddle/fluid/operators/is_empty_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/is_empty_op.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -24,12 +26,6 @@ class IsEmptyOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "IsEmpty"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "IsEmpty"); - ctx->SetOutputDim("Out", {1}); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto *x = ctx.Input("X"); @@ -56,12 +52,10 @@ It will just return product(tensor.ddims()) > 0; } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(is_empty, IsEmptyInferShapeFunctor, + PD_INFER_META(phi::IsEmptyInferMeta)); REGISTER_OPERATOR( is_empty, ops::IsEmptyOp, ops::IsEmptyOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - is_empty, ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); + paddle::framework::EmptyGradOpMaker, + IsEmptyInferShapeFunctor); diff --git a/paddle/fluid/operators/is_empty_op.cu.cc b/paddle/fluid/operators/is_empty_op.cu.cc deleted file mode 100644 index 3c256503baf6ba3bc8f8dff866a2ce9c57ec5bf1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/is_empty_op.cu.cc +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/is_empty_op.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - is_empty, ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel, - ops::IsEmptyOpKernel); diff --git a/paddle/fluid/operators/isclose_op.cc b/paddle/fluid/operators/isclose_op.cc index 0ae7a9fa02f1fb217555ae41d8b25cbba0e43d19..8668de4d3a6288841ad191f3e47b87a76eeb1d63 100644 --- a/paddle/fluid/operators/isclose_op.cc +++ b/paddle/fluid/operators/isclose_op.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/isclose_op.h" #include #include #include "paddle/fluid/framework/op_registry.h" @@ -23,45 +22,6 @@ namespace paddle { namespace operators { -template -struct GetTensorValue { - T operator()(const platform::CPUDeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - return *(tensor.data()); - } -}; - -template -struct IscloseFunctor { - void operator()(const platform::CPUDeviceContext& ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - auto* in_a = in.data(); - auto* in_b = other.data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto num = in.numel(); - // *out_data = true; - for (int i = 0; i < num; i++) { - out_data[i] = true; - } - for (int i = 0; i < num; i++) { - const T a = in_a[i], b = in_b[i]; - bool val; - if (std::isnan(a) || std::isnan(b)) { - val = equal_nan && std::isnan(a) == std::isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? 
left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - // *out_data &= val; - out_data[i] = val; - } - } -}; - class IscloseOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -154,12 +114,9 @@ class IscloseOpVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR( isclose, ops::IscloseOp, ops::IscloseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, ops::IscloseOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(isclose, ops::IscloseKernel, - ops::IscloseKernel); diff --git a/paddle/fluid/operators/isclose_op.cu b/paddle/fluid/operators/isclose_op.cu deleted file mode 100644 index 09710ba0c6957d39318abfc24113d4b9db11622d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/isclose_op.cu +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/isclose_op.h" - -namespace paddle { -namespace operators { - -template -struct GetTensorValue { - T operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& tensor) const { - const T* data = tensor.data(); - T value; - const auto gpu_place = dev_ctx.GetPlace(); - memory::Copy(platform::CPUPlace(), &value, gpu_place, data, sizeof(T), - dev_ctx.stream()); - return value; - } -}; - -template -__global__ void IscloseCUDAKernel(const T* in_data, const T* other_data, - const double rtol, const double atol, - bool equal_nan, int num, bool* out_data) { - unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; - bool val; - for (int i = idx; i < num; i += blockDim.x * gridDim.x) { - const T a = in_data[i], b = other_data[i]; - if (isnan(a) || isnan(b)) { - val = equal_nan && isnan(a) == isnan(b); - } else { - T left = (a > b ? a - b : b - a); - T right = atol + (b > 0 ? rtol * b : (-rtol) * b); - T diff = (left > right ? left - right : right - left); - val = a == b || left <= right || diff <= 1e-15; - } - out_data[i] = val; - // if (!val) *out_data = false; - } -} - -template -struct IscloseFunctor { - void operator()(const platform::CUDADeviceContext& dev_ctx, - const framework::Tensor& in, const framework::Tensor& other, - const double rtol, const double atol, bool equal_nan, - framework::Tensor* output) { - int num = in.numel(); - const T* in_data = in.data(); - const T* other_data = other.data(); - bool* out_data = output->mutable_data(dev_ctx.GetPlace()); - int block = 1024; - int grid = (block - 1 + num) / block; - grid = (grid > block) ? 
block : grid; -#ifdef PADDLE_WITH_HIP - hipMemset(out_data, true, num * sizeof(bool)); -#else - cudaMemset(out_data, true, num * sizeof(bool)); -#endif - IscloseCUDAKernel<<>>( - in_data, other_data, rtol, atol, equal_nan, num, out_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(isclose, ops::IscloseKernel, - ops::IscloseKernel); diff --git a/paddle/fluid/operators/isclose_op.h b/paddle/fluid/operators/isclose_op.h deleted file mode 100644 index cde5d2afbf009a16a3d0c3601697703d8ec8eb7d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/isclose_op.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -using Tensor = framework::Tensor; - -template -struct GetTensorValue { - T operator()(const platform::DeviceContext& ctx, - const framework::Tensor& tensor) const; -}; - -template -struct IscloseFunctor { - void operator()(const DeviceContext& ctx, const framework::Tensor& in, - const framework::Tensor& other, const float rtol, - const float atol, bool equal_nan, framework::Tensor* output); -}; - -template -class IscloseKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // get attrs - bool equal_nan = ctx.Attr("equal_nan"); - // get input/output - const auto* input = ctx.Input("Input"); - const auto* other = ctx.Input("Other"); - auto* out = ctx.Output("Out"); - - double rtol_v = std::stod(ctx.Attr("rtol")); - double atol_v = std::stod(ctx.Attr("atol")); - - auto& dev_ctx = ctx.template device_context(); - GetTensorValue get_tensor_value; - if (ctx.HasInput("Rtol")) { - const auto* rtol = ctx.Input("Rtol"); - PADDLE_ENFORCE_EQ( - rtol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Rtol) size must be 1, but get %d.", rtol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(rtol->dtype()), - framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Rtol) type must be double, but get %s.", - framework::DataTypeToString( - framework::TransToProtoVarType(rtol->dtype())))); - rtol_v = get_tensor_value(dev_ctx, *rtol); - } - if (ctx.HasInput("Atol")) { - const auto* atol = ctx.Input("Atol"); - PADDLE_ENFORCE_EQ( - atol->numel(), 1, - platform::errors::InvalidArgument( - "Input(Atol) size must be 1, but get %d", atol->numel())); - PADDLE_ENFORCE_EQ( - framework::TransToProtoVarType(atol->dtype()), - 
framework::proto::VarType::FP64, - platform::errors::InvalidArgument( - "Input(Atol) type must be double, but get %s", - framework::DataTypeToString( - framework::TransToProtoVarType(atol->dtype())))); - atol_v = get_tensor_value(dev_ctx, *atol); - } - - IscloseFunctor()(dev_ctx, *input, *other, rtol_v, atol_v, - equal_nan, out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 735fffa7203b1213fccec0c4098048e85a6b24f8..cfa370ff9cb19dfb7d488b03cba52c115083cdc8 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/isfinite_v2_op.h" - #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -49,11 +51,6 @@ class OverflowV2Op : public framework::OperatorWithKernel { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "isfinitev2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "isfinitev2"); - UnaryOpUnchangedInferShape(ctx); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -104,6 +101,14 @@ element of X as a tensor. 
} // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(isinf_v2, IsinfInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isnan_v2, IsnanInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isfinite_v2, IsfiniteInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); #define REGISTER_V2OP_MAKER(op_type, comment) \ namespace paddle { \ @@ -124,50 +129,17 @@ REGISTER_V2OP_MAKER(isfinite_v2, "isfinitev2(X)"); REGISTER_OPERATOR( isinf_v2, ops::OverflowV2Op, ops::_isinf_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsinfInferShapeFunctor); REGISTER_OPERATOR( isnan_v2, ops::OverflowV2Op, ops::_isnan_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsnanInferShapeFunctor); REGISTER_OPERATOR( isfinite_v2, ops::OverflowV2Op, ops::_isfinite_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); + paddle::framework::EmptyGradOpMaker, + IsfiniteInferShapeFunctor); diff --git a/paddle/fluid/operators/isfinite_v2_op.cu b/paddle/fluid/operators/isfinite_v2_op.cu deleted file mode 100644 index 1b9f19d36dfa0f590f96577295ffb12e4456d2e5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/isfinite_v2_op.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/isfinite_v2_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index a78d8ec10149db5a1f8d585cb06bb08ea6ca5a5f..dcd98054b05c314da0884e8dc6be358d3afb0483 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -9,7 +9,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/kldiv_loss_op.h" #include #include #include "paddle/fluid/framework/op_registry.h" @@ -177,10 +176,3 @@ REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker, ops::KLDivLossOpGradMaker); REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad, ops::KLDivLossGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - kldiv_loss, ops::KLDivLossKernel, - ops::KLDivLossKernel); -REGISTER_OP_CPU_KERNEL( - kldiv_loss_grad, - ops::KLDivLossGradKernel, - ops::KLDivLossGradKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.cu b/paddle/fluid/operators/kldiv_loss_op.cu deleted file mode 100644 index 5226cb8c08e3db4a0bfbbe4440c27264903f06e3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kldiv_loss_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/kldiv_loss_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - kldiv_loss, - ops::KLDivLossKernel, - ops::KLDivLossKernel); -REGISTER_OP_CUDA_KERNEL( - kldiv_loss_grad, - ops::KLDivLossGradKernel, - ops::KLDivLossGradKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.h b/paddle/fluid/operators/kldiv_loss_op.h deleted file mode 100644 index 5a6ef06f5eb1e855c8a528664528c9919304c7b9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kldiv_loss_op.h +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using Array1 = Eigen::DSizes; - -template -struct KLDivLossForward { - HOSTDEVICE KLDivLossForward() {} - - HOSTDEVICE T operator()(const T& target, const T& input) const { - if (target <= 0) { - return 0; - } else { - return target * (std::log(target) - input); - } - } -}; - -template -struct KLDivLossBackward { - HOSTDEVICE KLDivLossBackward() {} - - HOSTDEVICE T operator()(const T& target, const T& grad) const { - if (target <= 0) { - return 0; - } else { - return static_cast(-1.) 
* grad; - } - } -}; - -template -class KLDivLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context().eigen_device(); - auto* input = ctx.Input("X"); - auto* target = ctx.Input("Target"); - auto* loss = ctx.Output("Loss"); - auto reduction = ctx.Attr("reduction"); - - const int n = input->dims()[0]; - - loss->mutable_data(ctx.GetPlace()); - auto input_t = framework::EigenVector::Flatten(*input); - auto target_t = framework::EigenVector::Flatten(*target); - auto loss_t = framework::EigenVector::Flatten(*loss); - auto output = target_t.binaryExpr(input_t, KLDivLossForward()); - if ("none" == reduction) { - loss_t.device(place) = output; - } else if ("batchmean" == reduction) { - auto output_sum = output.sum(); - if (n > 0) { - loss_t.device(place) = output_sum / output_sum.constant(n); - } else { - loss_t.device(place) = output_sum; - } - } else if ("mean" == reduction) { - loss_t.device(place) = output.mean(); - } else if ("sum" == reduction) { - loss_t.device(place) = output.sum(); - } - } -}; - -template -class KLDivLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& place = *ctx.template device_context().eigen_device(); - auto* target = ctx.Input("Target"); - auto reduction = ctx.Attr("reduction"); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); - - const int n = input_grad->dims()[0]; - const int numel = input_grad->numel(); - const int expand = numel / loss_grad->numel(); - - input_grad->mutable_data(ctx.GetPlace()); - - auto target_t = framework::EigenVector::Flatten(*target); - - auto input_grad_t = framework::EigenVector::Flatten(*input_grad); - auto loss_grad_t = framework::EigenVector::Flatten(*loss_grad); - - auto loss_grad_expand = loss_grad_t.broadcast(Array1(expand)); - auto grad_t = 
target_t * loss_grad_expand; - input_grad_t.device(place) = - target_t.binaryExpr(grad_t, KLDivLossBackward()); - - if ("mean" == reduction) { - input_grad_t.device(place) = input_grad_t / static_cast(numel); - } else if ("batchmean" == reduction) { - input_grad_t.device(place) = input_grad_t / static_cast(n); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc index 322ae5df4cb877b4dde022e6c203a32cd8dd001d..eac181489aa9d09f4661c898b13e77570ad928a8 100644 --- a/paddle/fluid/operators/kldiv_loss_op_npu.cc +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the Licnse. */ -#include "paddle/fluid/operators/kldiv_loss_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index 58d51ab1c723f296d3728a23de95a116acbb4df3..68d0c7978b4e45f216abd5fa5c4be93f788e8f04 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -17,9 +17,7 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -178,27 +176,4 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(kron, ops::KronOp, ops::KronOpMaker, ops::KronGradOpMaker, ops::KronGradOpMaker); -REGISTER_OP_CPU_KERNEL( - kron, ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel>, - ops::KronKernel>); - REGISTER_OPERATOR(kron_grad, ops::KronGradOp); -REGISTER_OP_CPU_KERNEL( - kron_grad, ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel>, - ops::KronGradKernel>); diff --git a/paddle/fluid/operators/kron_op.cu b/paddle/fluid/operators/kron_op.cu deleted file mode 100644 index e5124e65007509568ae8cd8ab65b33c504a12fe9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kron_op.cu +++ /dev/null @@ -1,42 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - kron, ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel, - ops::KronKernel>, - ops::KronKernel>); - -REGISTER_OP_CUDA_KERNEL( - kron_grad, ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel, - ops::KronGradKernel>, - ops::KronGradKernel>); diff --git a/paddle/fluid/operators/kron_op.h b/paddle/fluid/operators/kron_op.h deleted file mode 100644 index 274b47c03a4d3d381dceda43d502a6e2d14669a5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/kron_op.h +++ /dev/null @@ -1,415 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "thrust/device_vector.h" -#endif - -namespace paddle { -namespace operators { - -// Process an element in the output, used with a parallel-for -template -struct KronElemFunctor { - KronElemFunctor(const T* a, const T* b, T* out, const int64_t* shape_b, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* stride_out, int ndims) - : a_(a), - b_(b), - out_(out), - shape_b_(shape_b), - stride_a_(stride_a), - stride_b_(stride_b), - stride_out_(stride_out), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) const { - // it computes 1 element in the output - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_out_[i]; - index = index % stride_out_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - out_[idx] = a_[index_a] * b_[index_b]; - } - - private: - const T* a_; - const T* b_; - T* out_; - const int64_t* shape_b_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* stride_out_; - const int ndims_; -}; - -template -struct KronOpFunctor { - void operator()(const DeviceContext& dev_ctx, const framework::Tensor& x, - const framework::Tensor& y, framework::Tensor* out) { - int ndims = out->dims().size(); - int64_t numel = out->numel(); - - const framework::DDim& dim_x = x.dims(); - const framework::DDim& dim_y = y.dims(); - const framework::DDim& dim_out = out->dims(); - const framework::DDim stride_x = phi::stride(dim_x); - const framework::DDim stride_y = phi::stride(dim_y); - const framework::DDim stride_out = phi::stride(dim_out); - - const int64_t 
*p_stride_x = nullptr, *p_stride_y = nullptr, - *p_stride_out = nullptr, *p_shape_y = nullptr; -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector d_stride_x(ndims); - thrust::device_vector d_stride_y(ndims); - thrust::device_vector d_stride_out(ndims); - thrust::device_vector d_shape_y(ndims); - thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); - thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); - thrust::copy(stride_out.Get(), stride_out.Get() + ndims, - d_stride_out.begin()); - thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); - - p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); - p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); - p_stride_out = thrust::raw_pointer_cast(d_stride_out.data()); - p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); -#else - p_stride_x = stride_x.Get(); - p_stride_y = stride_y.Get(); - p_stride_out = stride_out.Get(); - p_shape_y = dim_y.Get(); -#endif - - platform::ForRange for_range(dev_ctx, numel); - KronElemFunctor functor(x.data(), y.data(), out->data(), - p_shape_y, p_stride_x, p_stride_y, p_stride_out, - ndims); - for_range(functor); - } -}; - -template -struct KronGradElemFunctor { - KronGradElemFunctor(const T* dout, const T* A, const T* B, T* dout_a, - T* dout_b, const int64_t* stride_dout, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* shape_b, const int64_t numel_a, - const int64_t numel_b, const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto 
pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + index_b; - dout_a_[index_out_a] = dout_[idx] * B_[index_b]; - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = dout_[idx] * A_[index_a]; - } - } - - private: - const T* dout_; - const T* A_; - const T* B_; - T* dout_a_; - T* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template -struct KronGradElemFunctor> { - KronGradElemFunctor(const platform::complex* dout, - const platform::complex* A, - const platform::complex* B, - platform::complex* dout_a, - platform::complex* dout_b, const int64_t* stride_dout, - const int64_t* stride_a, const int64_t* stride_b, - const int64_t* shape_b, const int64_t numel_a, - const int64_t numel_b, const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + index_b; - dout_a_[index_out_a] = - dout_[idx] * - platform::complex(B_[index_b].real, -B_[index_b].imag); - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = - dout_[idx] * - platform::complex(A_[index_a].real, -A_[index_a].imag); - } - } - - private: - 
const platform::complex* dout_; - const platform::complex* A_; - const platform::complex* B_; - platform::complex* dout_a_; - platform::complex* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template -struct KronGradOpFunctor { - void operator()(const DeviceContext& dev_ctx, const framework::Tensor& dout, - const framework::Tensor& x, const framework::Tensor& y, - framework::Tensor* dx, framework::Tensor* dy) { - int ndims = dout.dims().size(); - int64_t numel = dout.numel(); - int64_t numel_x = x.numel(); - int64_t numel_y = y.numel(); - - const framework::DDim& dim_x = x.dims(); - const framework::DDim& dim_y = y.dims(); - const framework::DDim& dim_dout = dout.dims(); - - const framework::DDim stride_x = phi::stride(dim_x); - const framework::DDim stride_y = phi::stride(dim_y); - const framework::DDim stride_dout = phi::stride(dim_dout); - - const int64_t* p_stride_x = nullptr; - const int64_t* p_stride_y = nullptr; - const int64_t* p_stride_dout = nullptr; - const int64_t* p_shape_y = nullptr; -#if defined(__NVCC__) || defined(__HIPCC__) - thrust::device_vector d_stride_x(ndims); - thrust::device_vector d_stride_y(ndims); - thrust::device_vector d_stride_dout(ndims); - thrust::device_vector d_shape_y(ndims); - thrust::copy(stride_x.Get(), stride_x.Get() + ndims, d_stride_x.begin()); - thrust::copy(stride_y.Get(), stride_y.Get() + ndims, d_stride_y.begin()); - thrust::copy(stride_dout.Get(), stride_dout.Get() + ndims, - d_stride_dout.begin()); - thrust::copy(dim_y.Get(), dim_y.Get() + ndims, d_shape_y.begin()); - - p_stride_x = thrust::raw_pointer_cast(d_stride_x.data()); - p_stride_y = thrust::raw_pointer_cast(d_stride_y.data()); - p_stride_dout = thrust::raw_pointer_cast(d_stride_dout.data()); - p_shape_y = thrust::raw_pointer_cast(d_shape_y.data()); -#else - p_stride_x = stride_x.Get(); - p_stride_y = 
stride_y.Get(); - p_stride_dout = stride_dout.Get(); - p_shape_y = dim_y.Get(); -#endif - // dout_x: dout * kron(ones(X), Y) re-aranged in shape (numel_x, numel_y) - // dout_y: dout * kron(X, ones(Y)) re-aranged in shaoe (numel_y, numel_x) - framework::Tensor dout_x; - T* p_dout_x = nullptr; - if (dx) { - dout_x.mutable_data({numel_x, numel_y}, dev_ctx.GetPlace()); - p_dout_x = dout_x.data(); - } - framework::Tensor dout_y; - T* p_dout_y = nullptr; - if (dy) { - dout_y.mutable_data({numel_y, numel_x}, dev_ctx.GetPlace()); - p_dout_y = dout_y.data(); - } - - platform::ForRange for_range(dev_ctx, numel); - KronGradElemFunctor func(dout.data(), x.data(), y.data(), - p_dout_x, p_dout_y, p_stride_dout, p_stride_x, - p_stride_y, p_shape_y, numel_x, numel_y, ndims); - for_range(func); - -// reduce_sum along aixs 1 -#if defined(__NVCC__) || defined(__HIPCC__) - auto stream = dev_ctx.stream(); // it is a cuda device_context - if (dx) { - TensorReduceImpl>( - dev_ctx, dout_x, dx, kps::IdentityFunctor(), {1}, stream); - } - if (dy) { - TensorReduceImpl>( - dev_ctx, dout_y, dy, kps::IdentityFunctor(), {1}, stream); - } -#else - auto* place = dev_ctx.eigen_device(); - Eigen::array reduce_dim = {1}; - if (dx) { - auto eigen_dout_x = framework::EigenMatrix::Reshape(dout_x, 1); - auto eigen_vec_dx = framework::EigenVector::Flatten(*dx); - eigen_vec_dx.device(*place) = eigen_dout_x.sum(reduce_dim); - } - if (dy) { - auto eigen_dout_y = framework::EigenMatrix::Reshape(dout_y, 1); - auto eigen_vec_dy = framework::EigenVector::Flatten(*dy); - eigen_vec_dy.device(*place) = eigen_dout_y.sum(reduce_dim); - } -#endif - } -}; - -inline framework::Tensor UnsqueezeTo(const framework::Tensor& src, int ndims) { - const framework::DDim& shape = src.dims(); - int rank = shape.size(); - framework::Tensor res; - res.ShareDataWith(src); - PADDLE_ENFORCE_LE( - rank, ndims, - platform::errors::InvalidArgument( - "The input Tensor's rank should be less than or equal to ndims" - "Received input 
Tensor's rank = %d, ndims = %d", - rank, ndims)); - if (rank < ndims) { - std::vector new_dim(ndims, 1); - for (int i = ndims - rank; i < ndims; i++) { - new_dim[i] = shape[i - ndims + rank]; - } - res.Resize(phi::make_ddim(new_dim)); - } - return res; -} - -template -class KronKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int ndims = out->dims().size(); - framework::Tensor xx = UnsqueezeTo(*x, ndims); - framework::Tensor yy = UnsqueezeTo(*y, ndims); - - KronOpFunctor func; - func(dev_ctx, xx, yy, out); - } -}; - -template -class KronGradKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - auto& dev_ctx = ctx.template device_context(); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - if (dx) { - dx->mutable_data(ctx.GetPlace()); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - } - - int ndims = dout->dims().size(); - framework::Tensor xx = UnsqueezeTo(*x, ndims); - framework::Tensor yy = UnsqueezeTo(*y, ndims); - - framework::Tensor* pdxx = nullptr; - framework::Tensor* pdyy = nullptr; - framework::Tensor dxx; - framework::Tensor dyy; - if (dx) { - dxx = UnsqueezeTo(*dx, ndims); - pdxx = &dxx; - } - - if (dy) { - dyy = UnsqueezeTo(*dy, ndims); - pdyy = &dyy; - } - - KronGradOpFunctor func; - func(dev_ctx, *dout, xx, yy, pdxx, pdyy); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu index 4f30c58d375008abb3509989f90bcd9fec91fb38..f6f56f70f1a11971b31e679ef879f2d1d0a96085 
100644 --- a/paddle/fluid/operators/kthvalue_op.cu +++ b/paddle/fluid/operators/kthvalue_op.cu @@ -16,7 +16,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/kthvalue_op.h" #include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" #ifdef __NVCC__ #include "cub/cub.cuh" #endif diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 62c21dd2eee401e5f8a526870015c18cf13ee873..412ae3c49b5f3cc9fc2422aa220af324e6d99b69 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -22,10 +22,10 @@ limitations under the License. */ namespace cub = hipcub; #endif -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" namespace paddle { namespace operators { @@ -186,8 +186,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( const ScaleT *__restrict__ gamma_ptr, const ScaleT *__restrict__ beta_ptr, U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, T *__restrict__ y_ptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -203,8 +203,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( Vec_scale beta[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); - platform::Load(beta_ptr + col * VecSize, &beta[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(beta_ptr + col * VecSize, &beta[it]); col += THREADS_PER_ROW; } @@ -213,8 +213,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) 
void ln_fwd_1024_kernel( Vec x[LDGS]; #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); col += THREADS_PER_ROW; } U xf[LDGS * VecSize]; @@ -276,8 +275,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_fwd_1024_kernel( #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - platform::Store(x[it], - y_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], y_ptr + row * LN_NUM_COLS + col * VecSize); col += THREADS_PER_ROW; } } @@ -401,9 +399,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( U *__restrict__ dgamma_temp_ptr, U *__restrict__ dbeta_temp_ptr, T *__restrict__ dx_ptr, const MaskType *mask_ptr = nullptr, T factor = static_cast(0), T *d_dropout_src_ptr = nullptr) { - using Vec = platform::AlignedVector; - using Vec_scale = platform::AlignedVector; - using MaskLoadT = platform::AlignedVector; + using Vec = phi::AlignedVector; + using Vec_scale = phi::AlignedVector; + using MaskLoadT = phi::AlignedVector; const int tidx = threadIdx.x; const int bidx = blockIdx.x; @@ -439,7 +437,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( int col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Load(gamma_ptr + col * VecSize, &gamma[it]); + phi::Load(gamma_ptr + col * VecSize, &gamma[it]); col += THREADS_PER_ROW; } @@ -452,12 +450,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( int col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Load(dout_ptr + row * LN_NUM_COLS + col * VecSize, - &dout[it]); - platform::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, - &x[it]); + phi::Load(dout_ptr + row * LN_NUM_COLS + col * VecSize, + &dout[it]); + phi::Load(x_ptr + row * LN_NUM_COLS + col * VecSize, &x[it]); if (isFusedDropoutResidualLn) { - platform::Load( + phi::Load( mask_ptr + 
row * LN_NUM_COLS + col * VecSize, &mask_vec[it]); } @@ -552,10 +549,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( col = c; #pragma unroll for (int it = 0; it < LDGS; it++) { - platform::Store(x[it], - dx_ptr + row * LN_NUM_COLS + col * VecSize); + phi::Store(x[it], dx_ptr + row * LN_NUM_COLS + col * VecSize); if (isFusedDropoutResidualLn) { - platform::Store( + phi::Store( dout[it], d_dropout_src_ptr + row * LN_NUM_COLS + col * VecSize); } col += THREADS_PER_ROW; @@ -641,7 +637,7 @@ template < __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( const int rows, U *__restrict__ dg_part_, U *__restrict__ db_part_, ScaleT *__restrict__ dg_, ScaleT *__restrict__ db_) { - using Vec = platform::AlignedVector; + using Vec = phi::AlignedVector; static_assert(VEC_COLS == LN_NUM_COLS / VecSize, ""); const int tidx = threadIdx.x; @@ -669,8 +665,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void ln_bwd_1024_final_kernel( for (int row = r; row < rows; row += ROWS_PER_CTA) { Vec dg; Vec db; - platform::Load(dg_part_ptr, &dg); - platform::Load(db_part_ptr, &db); + phi::Load(dg_part_ptr, &dg); + phi::Load(db_part_ptr, &db); dg_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; db_part_ptr += ROWS_PER_CTA * LN_NUM_COLS; diff --git a/paddle/fluid/operators/lerp_op.cc b/paddle/fluid/operators/lerp_op.cc index 0aaefc7ca75eb0f98e35200f0a1940aae07315b2..5e053445379118b37c9b0e0bdcb01adaec65b6c1 100644 --- a/paddle/fluid/operators/lerp_op.cc +++ b/paddle/fluid/operators/lerp_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -20,49 +23,6 @@ namespace operators { class LerpOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "lerp"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "lerp"); - OP_INOUT_CHECK(ctx->HasInput("Weight"), "Input", "Weight", "lerp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "lerp"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - auto w_dims = ctx->GetInputDim("Weight"); - framework::DDim out_dims; - out_dims = GetOutputDims(x_dims, y_dims); - if (w_dims.size() > 1 || w_dims[0] != 1) { - out_dims = GetOutputDims(out_dims, w_dims); - } - - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - - private: - framework::DDim GetOutputDims(const framework::DDim& s_dims, - const framework::DDim& l_dims) const { - if (s_dims.size() > l_dims.size()) { - return GetOutputDims(l_dims, s_dims); - } - std::vector shapes = phi::vectorize(l_dims); - for (int i = s_dims.size() - 1, j = l_dims.size() - 1; i >= 0; --i, --j) { - int64_t s = s_dims[i]; - int64_t l = l_dims[j]; - if (s != l) { - if (l == 1) { - shapes[j] = s; - } else if (s != 1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The shape of tensor a %s:%d must match shape of tensor b " - "%s:%d.", - s_dims.to_str(), i, l_dims.to_str(), j)); - } - } - } - return phi::make_ddim(shapes); - } }; class LerpOpMaker : public framework::OpProtoAndCheckerMaker { @@ -125,10 +85,12 @@ DECLARE_INPLACE_OP_INFERER(LerpInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(lerp, 
LerpInferShapeFunctor, + PD_INFER_META(phi::LerpInferMeta)); REGISTER_OPERATOR( lerp, paddle::operators::LerpOp, paddle::operators::LerpOpMaker, paddle::operators::LerpOpGradMaker, paddle::operators::LerpOpGradMaker, - paddle::operators::LerpInplaceInferer); + paddle::operators::LerpInplaceInferer, LerpInferShapeFunctor); REGISTER_OPERATOR(lerp_grad, paddle::operators::LerpGradOp); diff --git a/paddle/fluid/operators/lgamma_op.cc b/paddle/fluid/operators/lgamma_op.cc index 148fb05afcfd9a4ef1fcbc587a2bd33947a41000..72c6b41efa98922b4ba23fa4b6e1a83f931c701e 100644 --- a/paddle/fluid/operators/lgamma_op.cc +++ b/paddle/fluid/operators/lgamma_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/lgamma_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -35,16 +38,6 @@ $$out = log\Gamma(x)$$ class LgammaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Lgamma"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Lgamma"); - - auto in_dims = ctx->GetInputDim("X"); - - ctx->SetOutputDim("Out", in_dims); - ctx->ShareLoD("X", "Out"); - } }; template @@ -83,17 +76,12 @@ class LgammaGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(lgamma, LgammaInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(lgamma, ops::LgammaOp, ops::LgammaOpMaker, ops::LgammaGradMaker, - ops::LgammaGradMaker); + ops::LgammaGradMaker, + LgammaInferShapeFunctor); REGISTER_OPERATOR(lgamma_grad, ops::LgammaGradOp); - 
-REGISTER_OP_CPU_KERNEL( - lgamma, ops::LgammaKernel, - ops::LgammaKernel) - -REGISTER_OP_CPU_KERNEL( - lgamma_grad, - ops::LgammaGradKernel, - ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu deleted file mode 100644 index b9f273727b00bb5ec4398bf82b0a19737ee2387a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/lgamma_op.cu +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/lgamma_op.h" - -namespace paddle { -namespace operators { - -template -struct CudaLgammaFunctor { - __device__ __forceinline__ T operator()(const T x) const { - return Eigen::numext::lgamma(x); - } -}; - -template -class LgammaKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - auto& dev_ctx = context.device_context(); - std::vector ins = {x}; - std::vector outs = {out}; - auto functor = CudaLgammaFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - lgamma, ops::LgammaKernel, - ops::LgammaKernel); - -REGISTER_OP_CUDA_KERNEL( - lgamma_grad, - ops::LgammaGradKernel, - ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.h b/paddle/fluid/operators/lgamma_op.h deleted file mode 100644 index 674054e74573208ea9bbd537419d202e1a30d8c0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/lgamma_op.h +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct LgammaFunctor { - LgammaFunctor(const T* input, T* output, int64_t numel) - : input_(input), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = Eigen::numext::lgamma(input_[idx]); - } - - private: - const T* input_; - T* output_; - int64_t numel_; -}; - -template -struct LgammaGradFunctor { - LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) - : dout_(dout), x_(x), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]); - } - - private: - const T* dout_; - const T* x_; - T* output_; - int64_t numel_; -}; - -using Tensor = framework::Tensor; - -template -class LgammaKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - auto numel = x->numel(); - auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace(), - size_t(x->numel() * sizeof(T))); - - auto& dev_ctx = context.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - LgammaFunctor functor(x_data, out_data, numel); - for_range(functor); - } -}; - -template -class LgammaGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const framework::Tensor* d_out = - ctx.Input(framework::GradVarName("Out")); - const framework::Tensor* x = ctx.Input("X"); - framework::Tensor* d_x = - ctx.Output(framework::GradVarName("X")); - - auto numel = d_out->numel(); - auto* dout_data = d_out->data(); - auto* x_data = x->data(); - auto* dx_data = d_x->mutable_data( - ctx.GetPlace(), 
static_cast(numel * sizeof(T))); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, numel); - LgammaGradFunctor functor(dout_data, x_data, dx_data, numel); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index fe271fa5e893a750bdbbdc05ac4b7835205ebe66..378c7573d6129abc28bd53dd6f964e5c726cce34 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/linspace_op.h" #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,33 +27,6 @@ class LinspaceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "linspace"); - - auto s_dims = ctx->GetInputDim("Start"); - PADDLE_ENFORCE_EQ((s_dims.size() == 1) && (s_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Start) must be [1]," - "but received input shape is [%s].", - s_dims)); - auto e_dims = ctx->GetInputDim("Stop"); - PADDLE_ENFORCE_EQ((e_dims.size() == 1) && (e_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of 
Input(Stop) must be [1]," - "but received input shape is [%s].", - e_dims)); - auto step_dims = ctx->GetInputDim("Num"); - PADDLE_ENFORCE_EQ( - (step_dims.size() == 1) && (step_dims[0] == 1), true, - platform::errors::InvalidArgument("The shape of Input(Num) must be [1]," - "but received input shape is [%s].", - step_dims)); - ctx->SetOutputDim("Out", {-1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -88,11 +65,13 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker); -REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel); +DECLARE_INFER_SHAPE_FUNCTOR(linspace, LinspaceInferShapeFunctor, + PD_INFER_META(phi::LinspaceInferMeta)); +REGISTER_OPERATOR( + linspace, ops::LinspaceOp, ops::LinspaceOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + LinspaceInferShapeFunctor); REGISTER_OP_VERSION(linspace) .AddCheckpoint( diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu deleted file mode 100644 index aa625a7f5b9df0aa76872c56a3769f1186125bf5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/linspace_op.cu +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/linspace_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void LinspaceKernel(T start, T stop, double step, int64_t size, - T* out) { - int64_t index = blockIdx.x * blockDim.x + threadIdx.x; - - for (; index < size; index += blockDim.x * gridDim.x) { - if (index < size / 2) { - out[index] = static_cast(start + step * index); - } else { - out[index] = static_cast(stop - step * (size - index - 1)); - } - } -} - -template -__global__ void LinspaceSpecialKernel(T start, T* out) { - out[0] = static_cast(start); -} - -template -class CUDALinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - auto* num_t = context.Input("Num"); - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - framework::Tensor n_start; - framework::Tensor n_stop; - framework::Tensor n_num; - framework::TensorCopy(start_t, platform::CPUPlace(), &n_start); - T start = n_start.data()[0]; - framework::TensorCopy(stop_t, platform::CPUPlace(), &n_stop); - T 
stop = n_stop.data()[0]; - framework::TensorCopy(*num_t, platform::CPUPlace(), &n_num); - int64_t num = static_cast(n_num.data()[0]); - - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - T* out_data = out->mutable_data(context.GetPlace()); - - double step = 0; - auto stream = context.cuda_device_context().stream(); - int block = 512; - int grid = (num + block - 1) / block; - if (num != 1) { - step = (static_cast(stop - start)) / (num - 1); - LinspaceKernel<<>>(start, stop, step, num, - out_data); - } else { - LinspaceSpecialKernel<<>>(start, out_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h deleted file mode 100644 index ae51f1221cc09b433e784ecaf52da69e41fc3706..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/linspace_op.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class CPULinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - int32_t num = context.Input("Num")->data()[0]; - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - T start = start_t.data()[0]; - T stop = stop_t.data()[0]; - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - - T* out_data = out->mutable_data(context.GetPlace()); - - if (num > 1) { - // step should be of double type for all types - double step = (static_cast(stop - start)) / (num - 1); - int half_num = num / 2; - for (int i = 0; i < num; ++i) { - if (i < half_num) { - out_data[i] = static_cast(start + step * i); - } else { - out_data[i] = static_cast(stop - step * (num - i - 1)); - } - } - } else { - out_data[0] = static_cast(start); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 
df4d0ebbccd5e3fb4dd6131fb5fbcaa9056bd9d6..883e3597d8a31138a6ff1e4cfcb05a165eafc4a6 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,43 +24,6 @@ namespace operators { class LogLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predicted"), "Input", "Predicted", "LogLoss"); - OP_INOUT_CHECK(ctx->HasInput("Labels"), "Input", "Labels", "LogLoss"); - - auto pred_dims = ctx->GetInputDim("Predicted"); - auto label_dims = ctx->GetInputDim("Labels"); - - if (ctx->IsRuntime() || - (phi::product(pred_dims) > 0 && phi::product(label_dims) > 0)) { - PADDLE_ENFORCE_EQ( - pred_dims, label_dims, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be equal to the" - "dimensions of Input(Labels), but received dimensions of " - "Input(Predicted)" - "is [%s], received dimensions of Input(Labels) is [%s].", - pred_dims, label_dims)); - } - PADDLE_ENFORCE_EQ(pred_dims.size(), 2, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be 2," - "But received dimensions of Input(Predicted)" - "is [%d]", - pred_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - pred_dims[1], 1, - platform::errors::InvalidArgument( - "Each row of Input(Predicted) contains a real value, " - "so the 2nd dimension of Input(X) must be 1," - "But got [%d]", - 
pred_dims[1])); - } - ctx->SetOutputDim("Loss", {pred_dims[0], 1}); - ctx->ShareLoD("Predicted", "Loss"); - } }; template @@ -145,17 +111,10 @@ class LogLossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(log_loss, LogLossInferShapeFunctor, + PD_INFER_META(phi::LogLossInferMeta)); REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker, ops::LogLossGradMaker, - ops::LogLossGradMaker); + ops::LogLossGradMaker, + LogLossInferShapeFunctor); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); -REGISTER_OP_CPU_KERNEL( - log_loss, ops::LogLossKernel); -REGISTER_OP_CPU_KERNEL( - log_loss_grad, - ops::LogLossGradKernel); -REGISTER_OP_CUDA_KERNEL( - log_loss, ops::LogLossKernel); -REGISTER_OP_CUDA_KERNEL( - log_loss_grad, - ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h deleted file mode 100644 index e7985ab810b138da62390fae29eb4a6cf638c897..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/log_loss_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; - -template -class LogLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* loss_out = ctx.Output("Loss"); - - loss_out->mutable_data(ctx.GetPlace()); - - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); - auto label = EigenVector::Flatten(*ctx.Input("Labels")); - - auto loss = EigenVector::Flatten(*loss_out); - auto& place = *ctx.template device_context().eigen_device(); - - EigenLogLoss, T>::Eval( - place, loss, prediction, label, epsilon); - } -}; - -template -class LogLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto epsilon = static_cast(ctx.Attr("epsilon")); - - auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); - auto label = EigenVector::Flatten(*ctx.Input("Labels")); - - auto* dloss = ctx.Input(framework::GradVarName("Loss")); - auto* dpred = ctx.Output(framework::GradVarName("Predicted")); - - auto dl = EigenVector::Flatten(*dloss); - auto& place = *ctx.template device_context().eigen_device(); - - if (dpred) { - dpred->mutable_data(ctx.GetPlace()); - auto dx = framework::EigenVector::Flatten(*dpred); - EigenLogLossGrad, T>::Eval( - place, dx, dl, prediction, label, epsilon); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc index 9775910bba5cf30096f395c20d9dff3b5b1e541f..f103a69707a214400bbe2734409df4d9de3902e8 100644 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ 
b/paddle/fluid/operators/log_loss_op_npu.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index b2e68e9870d3c4f240fe35a4cbec811aefbc13f1..aa5fdd86745d6932052347f3dc11b14e3d447d20 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -10,11 +10,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/log_loss_op.h" #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class LogLossXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 8770abdac838f63b0c9f3a95b1ac0283a80ecbf2..26b6ce43303d181c41b60cf36c229d00acb0e626 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -12,459 +12,43 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include "paddle/fluid/operators/log_softmax_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/kernels/funcs/elementwise_functor.h" -#include "paddle/phi/kernels/funcs/functors.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { namespace operators { -#define LAUNCH_WARP_FORWAR_COMPUTE(near_greater_power_of_two) \ - case near_greater_power_of_two: \ - ComputeLogSoftmaxForwardInWarp< \ - T, AccT, near_greater_power_of_two><<>>( \ - dst, src, outer_size, dim_size); \ - break; - -template -__device__ __forceinline__ T WarpReduceSum(T value) { -#pragma unroll - for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { - T sum_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); - value = value + sum_val; - } - return value; -} - -template -__device__ __forceinline__ T WarpReduceMax(T value) { -#pragma unroll - for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { - T max_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); - value = max(value, max_val); - } - return value; -} - -int GetNearGreaterPowerOfTwo(int value) { - int log2_value = 0; - while ((1 << log2_value) < value) { - ++log2_value; - } - return 1 << log2_value; -} - -template -__global__ void ComputeLogSoftmaxForwardInWarp(T *dst, const T *src, - int batch_size, - int element_count) { - constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo; - constexpr int kernel_warp_size = - (near_greater_power_of_two < 32) ? 
near_greater_power_of_two : 32; - constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; - int batch_id = blockDim.y * blockIdx.x + threadIdx.y; - - int thread_in_warp_idx = threadIdx.x; - - // 1.read data from global memory to registers - AccT elements[warp_iter]; - // set effective_element_count as the num of elements when warps do effective - // work - // set effective_element_count as 0, when warps do ineffective work - int effective_element_count = (batch_id < batch_size) ? element_count : 0; - for (int it = 0; it < warp_iter; ++it) { - int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < effective_element_count) { - elements[it] = - static_cast(src[batch_id * element_count + element_index]); - } else { - elements[it] = -std::numeric_limits::infinity(); - } - } - - // 2.compute max_value. For each thread, loop all registers to find max - AccT max_value = elements[0]; -#pragma unroll - for (int it = 1; it < warp_iter; ++it) { - max_value = (max_value > elements[it]) ? max_value : elements[it]; - } - max_value = WarpReduceMax(max_value); - - // 3.For each warp, accumulate all thread registers - AccT sum = 0.0f; -#pragma unroll - for (int it = 0; it < warp_iter; ++it) { - sum += std::exp(elements[it] - max_value); - } - sum = WarpReduceSum(sum); - - // 4.store result. - sum = std::log(sum); -#pragma unroll - for (int it = 0; it < warp_iter; ++it) { - int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < effective_element_count) { - dst[batch_id * element_count + element_index] = - static_cast(elements[it] - max_value - sum); - } else { - break; - } - } -} - -template -void LaunchSoftmaxForwardForLastAxis(T *dst, const T *src, int dim_size, - int outer_size, gpuStream_t stream) { - int threads_per_block = 128; - int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size); - int kernel_warp_size = - (near_greater_power_of_two < 32) ? 
near_greater_power_of_two : 32; - int warps_per_block = (threads_per_block / kernel_warp_size); - int blocks = (outer_size + warps_per_block - 1) / warps_per_block; - dim3 threads(kernel_warp_size, warps_per_block, 1); - - switch (near_greater_power_of_two) { - LAUNCH_WARP_FORWAR_COMPUTE(1); - LAUNCH_WARP_FORWAR_COMPUTE(2); - LAUNCH_WARP_FORWAR_COMPUTE(4); // dim_size: 3~4 - LAUNCH_WARP_FORWAR_COMPUTE(8); // dim_size: 5~8 - LAUNCH_WARP_FORWAR_COMPUTE(16); // dim_size: 9~16 - LAUNCH_WARP_FORWAR_COMPUTE(32); // dim_size: 17~32 - LAUNCH_WARP_FORWAR_COMPUTE(64); // dim_size: 33~64 - LAUNCH_WARP_FORWAR_COMPUTE(128); // dim_size 65~128 - LAUNCH_WARP_FORWAR_COMPUTE(256); // dim_size 129~256 - LAUNCH_WARP_FORWAR_COMPUTE(512); // dim_size 257~512 - LAUNCH_WARP_FORWAR_COMPUTE(1024); // dim_size 513~1024 - - default: - break; - } -} - -// Returns the final item after reduce operation along block.x. -// Firstly, get shared memory(smem) offset, find the starting position for every -// y. -// Secondly, initialise every smem position with value 'val' of thread itself. -// Thirdly, apply standard reduction along x direction as below: -// -// -> x direction -// [o o o o o o o o] time 0 -// | |/ / -// | /| / -// | / | / -// |/ |/ -// [o o o o x x x x] time 1 -// | |/ / -// |/|/ -// [o o x x x x x x] time 2 -// |/ -// [o x x x x x x x] time 3 -// -// Finally, return the first item. -// Imaging multiple reductions executed in paralell along y axis, -// Note that when blockDim.x is not 1, it's a EVEN number in all cases, -// and the size of shared memory is even as well. -template class Functor> -__forceinline__ __device__ T BlockReduceAlongDimX(T *shared, T val) { - Functor func; - // This reduction is not Block-wise reduction, only reduce along block.x. - // therefore the shared mem has offsets for different block.y. 
- shared += threadIdx.y * blockDim.x; - shared[threadIdx.x] = val; - int offset = blockDim.x / 2; - - while (offset > 0) { - __syncthreads(); - if (threadIdx.x < offset) { - shared[threadIdx.x] = - func(shared[threadIdx.x], shared[threadIdx.x + offset]); - } - offset /= 2; - } - __syncthreads(); - return shared[0]; -} - -template -__global__ void LogSoftmaxForwardCUDAKernelNotLastAxis( - T *output, const T *input, int outer_size, int dim_size, int inner_size) { - extern __shared__ unsigned char smem[]; - auto sdata = reinterpret_cast(smem); - - const int outer_stride = inner_size * dim_size; - const int dim_stride = inner_size; - - for (int x_id = blockIdx.x; x_id < outer_size; x_id += gridDim.x) { - for (int y_id = blockIdx.y * blockDim.y + threadIdx.y; y_id < inner_size; - y_id += blockDim.y * gridDim.y) { - const int data_offset = x_id * outer_stride + y_id; - // When blockDim.x==1, no block.x-reduction opetaions are needed. - // And threadIdx.x is 0 all the time, so the for-loops below are literally - // loops (No parallel executions). Loop all elements along axis and - // calculate the Max, Sum and (input[id]-Max-log(Sum)) to get the final - // log_softmax values along that axis. - // 1. reduce max - AccT max_value = -std::numeric_limits::infinity(); - // For one thread, iterate all items it responsable for, and get - // max_value. - // If there are N threads, N max_value will be returned. - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - const AccT value = - static_cast(input[data_offset + d * dim_stride]); - max_value = phi::funcs::MaxFunctor()(max_value, value); - } - // If there are more than 1 threads along block x, reduce all max_values - // and get the global max_value, which is the max value along "axis". - // If there is only one thread along block x, no need to reduce, as the - // 'max_value' is the global max_value. - if (blockDim.x > 1) { - max_value = BlockReduceAlongDimX( - sdata, max_value); - } - - // 2. 
reduce sum - AccT sum = 0; - // Below is the same execution as '1. reduce max' - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - sum += std::exp(static_cast(input[data_offset + d * dim_stride]) - - max_value); - } - if (blockDim.x > 1) { - sum = BlockReduceAlongDimX(sdata, sum); - } - - // 3. input-max-log_sum and write to output - for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { - output[data_offset + d * dim_stride] = static_cast( - static_cast(input[data_offset + d * dim_stride]) - max_value - - std::log(sum)); - } - } - } -} - -// block.y covers inner_size. Threads along the x axis process dim_size -// elements, and make sure not to exceed the 1024 threads per block. -// Note that dim_threads namely blockDim.x is either 1 or a even number. -inline dim3 GetBlockSize(int dim_size, int inner_size) { - int inner_threads = inner_size; - inner_threads = std::min(inner_threads, 1024); - int dim_threads = 1; - - while (dim_threads * inner_threads <= 1024 && dim_threads <= dim_size) { - dim_threads *= 2; - } - dim_threads /= 2; - return dim3(dim_threads, inner_threads); -} - -// First cover the y axis as many blocks as possible. -// Then cover the x axis as many blocks as possible, -// and make sure not to exceed the max_active_blocks. -inline dim3 GetGridSize(dim3 block, int max_active_blocks, int outer_size, - int dim_size, int inner_size) { - int inner_blocks = (inner_size + block.y - 1) / block.y; - if (inner_blocks > max_active_blocks) inner_blocks = max_active_blocks; - - int outer_blocks = (max_active_blocks + inner_blocks - 1) / inner_blocks; - if (outer_blocks > outer_size) outer_blocks = outer_size; - return dim3(outer_blocks, inner_blocks); -} - -// When designing grid size and block size, priority is given to block size, -// and grid will be determined according to the maximum number of active blocks, -// which is set by as a experience value. 
-template -void ComputeLaunchConfigure(Kernel k, int outer_size, int dim_size, - int inner_size, dim3 &grid, dim3 &block, - int &shared_mem, int num_sm) { - block = GetBlockSize(dim_size, inner_size); - int block_threads = block.x * block.y; - shared_mem = block.x == 1 ? 0 : block_threads * sizeof(T); - int max_active_blocks = num_sm * 2; - grid = - GetGridSize(block, max_active_blocks, outer_size, dim_size, inner_size); -} - -template -void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data, - const T *input_data, - int outer_size, int dim_size, - int inner_size, int num_sm, - gpuStream_t stream) { - int shared_mem; - dim3 grid; - dim3 block; - - ComputeLaunchConfigure( - &LogSoftmaxForwardCUDAKernelNotLastAxis, outer_size, dim_size, - inner_size, grid, block, shared_mem, num_sm); - - LogSoftmaxForwardCUDAKernelNotLastAxis< - T, MPDType><<>>( - output_data, input_data, outer_size, dim_size, inner_size); -} +using Tensor = framework::Tensor; template class LogSoftmaxKernel : public framework::OpKernel { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - public: - void Compute(const framework::ExecutionContext &context) const override { - const auto *x = context.Input("X"); - auto *out = context.Output("Out"); - const auto *input_data = x->data(); - auto *output_data = out->mutable_data(context.GetPlace()); - - const int rank = x->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); - int dim_size = x->dims()[axis]; - int inner_size = 1; - for (int i = axis + 1; i < x->dims().size(); ++i) { - inner_size *= x->dims()[i]; - } - int outer_size = SizeToAxis(axis, x->dims()); - gpuStream_t stream = context.cuda_device_context().stream(); - int num_sm = context.cuda_device_context().GetSMCount(); - - if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) 
<= 4096) { - LaunchSoftmaxForwardForLastAxis(output_data, input_data, - dim_size, outer_size, stream); - } else { - LaunchLogSoftmaxForwardCUDAKernelNotLastAxis( - output_data, input_data, outer_size, dim_size, inner_size, num_sm, - stream); - } + int input_axis = ctx.Attr("axis"); + auto &dev_ctx = ctx.template device_context(); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx, *x, input_axis, out); } }; -// Backward below -#define LAUNCH_WARP_BACKWARD_COMPUTE(near_greater_power_of_two) \ - case near_greater_power_of_two: \ - ComputeLogSoftmaxBackwardInWarp< \ - T, AccT, near_greater_power_of_two><<>>( \ - output, grad_output, grad_input, outer_size, dim_size); \ - break; - -template -__global__ void ComputeLogSoftmaxBackwardInWarp(const T *output, - const T *grad_output, - T *grad_input, int batch_size, - int element_count) { - constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo; - constexpr int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; - int batch_id = blockDim.y * blockIdx.x + threadIdx.y; - - int thread_in_warp_idx = threadIdx.x; - - // 1.read data from global memory to registers - AccT output_register[warp_iter]; - AccT grad_output_register[warp_iter]; - int effective_element_count = (batch_id < batch_size) ? element_count : 0; - for (int iter = 0; iter < warp_iter; ++iter) { - int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < effective_element_count) { - output_register[iter] = - static_cast(output[batch_id * element_count + element_index]); - grad_output_register[iter] = static_cast( - grad_output[batch_id * element_count + element_index]); - } else { - output_register[iter] = static_cast(0); - grad_output_register[iter] = static_cast(0); - } - } - - // 2. 
For each warp, accumulate all thread registers - AccT sum = grad_output_register[0]; -#pragma unroll - for (int iter = 1; iter < warp_iter; ++iter) { - sum += grad_output_register[iter]; - } - sum = WarpReduceSum(sum); - -// 3. write result in grad_input -#pragma unroll - for (int iter = 0; iter < warp_iter; ++iter) { - int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < effective_element_count) { - grad_input[batch_id * element_count + element_index] = static_cast( - (grad_output_register[iter] - std::exp(output_register[iter]) * sum)); - } - } -} - -template -void LaunchSoftmaxBackwardForLastAxis(T *grad_input, const T *grad_output, - const T *output, int dim_size, - int outer_size, gpuStream_t stream) { - int threads_per_block = 128; - int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size); - int kernel_warp_size = - (near_greater_power_of_two < 32) ? near_greater_power_of_two : 32; - int warps_per_block = (threads_per_block / kernel_warp_size); - int blocks = (outer_size + warps_per_block - 1) / warps_per_block; - dim3 threads(kernel_warp_size, warps_per_block, 1); - - switch (near_greater_power_of_two) { - LAUNCH_WARP_BACKWARD_COMPUTE(1); // dim_size: 1 - LAUNCH_WARP_BACKWARD_COMPUTE(2); // dim_size: 2 - LAUNCH_WARP_BACKWARD_COMPUTE(4); // dim_size: 3~4 - LAUNCH_WARP_BACKWARD_COMPUTE(8); // dim_size: 5~8 - LAUNCH_WARP_BACKWARD_COMPUTE(16); // dim_size: 9~16 - LAUNCH_WARP_BACKWARD_COMPUTE(32); // dim_size: 17~32 - LAUNCH_WARP_BACKWARD_COMPUTE(64); // dim_size: 33~64 - LAUNCH_WARP_BACKWARD_COMPUTE(128); // dim_size: 65~128 - LAUNCH_WARP_BACKWARD_COMPUTE(256); // dim_size: 129~256 - LAUNCH_WARP_BACKWARD_COMPUTE(512); // dim_size: 257~512 - LAUNCH_WARP_BACKWARD_COMPUTE(1024); // dim_size: 513~1024 - - default: - break; - } -} - template class LogSoftmaxGradKernel : public framework::OpKernel { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - public: - void Compute(const framework::ExecutionContext 
&context) const override { - const auto *out = context.Input("Out"); - const auto *d_out = - context.Input(framework::GradVarName("Out")); - auto *d_x = context.Output(framework::GradVarName("X")); + void Compute(const framework::ExecutionContext &ctx) const override { + auto *out = ctx.Input("Out"); + auto *dout = ctx.Input(framework::GradVarName("Out")); + auto *dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); - const auto *out_data = out->data(); - const auto *d_out_data = d_out->data(); - auto *d_x_data = d_x->mutable_data(context.GetPlace()); - - const int rank = out->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - - int dim_size = out->dims()[axis]; - int inner_size = 1; - for (int i = axis + 1; i < out->dims().size(); ++i) { - inner_size *= out->dims()[i]; - } - int outer_size = SizeToAxis(axis, out->dims()); - gpuStream_t stream = context.cuda_device_context().stream(); - - if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { - LaunchSoftmaxBackwardForLastAxis( - d_x_data, d_out_data, out_data, dim_size, outer_size, stream); - } else { - LogSoftmaxGradFunctor()( - context.template device_context(), out, - d_out, d_x, axis); - } + int input_axis = ctx.Attr("axis"); + auto &dev_ctx = ctx.template device_context(); + phi::SoftmaxBackwardCUDAKernelDriver(dev_ctx, *out, *dout, + input_axis, dx); } }; @@ -473,6 +57,17 @@ class LogSoftmaxGradKernel namespace ops = paddle::operators; namespace plat = paddle::platform; + +#ifdef PADDLE_WITH_HIP +REGISTER_OP_CUDA_KERNEL( + log_softmax, ops::LogSoftmaxKernel, + ops::LogSoftmaxKernel, + ops::LogSoftmaxKernel); +REGISTER_OP_CUDA_KERNEL( + log_softmax_grad, ops::LogSoftmaxGradKernel, + ops::LogSoftmaxGradKernel, + ops::LogSoftmaxGradKernel); +#else REGISTER_OP_CUDA_KERNEL( log_softmax, ops::LogSoftmaxKernel, ops::LogSoftmaxKernel, @@ -483,3 +78,4 @@ REGISTER_OP_CUDA_KERNEL( ops::LogSoftmaxGradKernel, ops::LogSoftmaxGradKernel, 
ops::LogSoftmaxGradKernel); +#endif diff --git a/paddle/fluid/operators/lstsq_op.cu b/paddle/fluid/operators/lstsq_op.cu index 92c9857f0b942f00c348a6199ea4b9789b398328..10e2867bf2953f5c6fbc3d50bd8156fa3b0266e9 100644 --- a/paddle/fluid/operators/lstsq_op.cu +++ b/paddle/fluid/operators/lstsq_op.cu @@ -17,9 +17,11 @@ #include #include +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/lstsq_op.h" #include "paddle/fluid/operators/qr_op.h" #include "paddle/fluid/platform/dynload/cusolver.h" +#include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { namespace operators { @@ -70,6 +72,10 @@ class LstsqCUDAKernel : public framework::OpKernel { Tensor tau = dito.Fill(tau_dims_vec, 0); auto tau_data = tau.mutable_data(context.GetPlace()); + using Context = + typename framework::ConvertToPhiContext::TYPE; + auto& phi_dev_ctx = static_cast(dev_ctx); + if (m >= n) { Tensor tmp_x = dito.Transpose(new_x); Tensor tmp_y = dito.Transpose(new_y); @@ -93,8 +99,9 @@ class LstsqCUDAKernel : public framework::OpKernel { Tensor slice_y = dito.Slice(trans_y, {-2}, {0}, {min_mn}); // Step 3, solve R X = Y - triangular_solve(dev_ctx, res_r, slice_y, solution, - true, false, false); + phi::TriangularSolveKernel(phi_dev_ctx, res_r, slice_y, true, + false, false, solution); + } else { auto x_data = new_x.mutable_data(context.GetPlace()); auto y_data = new_y.mutable_data(context.GetPlace()); @@ -105,8 +112,8 @@ class LstsqCUDAKernel : public framework::OpKernel { // Step 2, solve R^H Z = Y Tensor trans_r = dito.Transpose(new_x); - triangular_solve(dev_ctx, trans_r, new_y, solution, - true, true, false); + phi::TriangularSolveKernel(phi_dev_ctx, trans_r, new_y, true, + true, false, solution); // Step 3, X <- Q Z BatchedOrgqr(dev_ctx, batch_count, n, n, min_mn, x_data, diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index 3cbbc62e7bec92f329535e788f19d439c9341a0e..520722dafcbea3ce8c545389317516cc22f7689f 100644 --- 
a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -22,7 +22,6 @@ #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index f323e2e041d994eb01c9d4e934984b8a005ffcec..214b2eccae9f75e9bfcfa3df0b823918e2b0c353 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -15,12 +15,13 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/set_value_op.h" #include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/phi/kernels/funcs/lapack/lapack_function.h" #include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/triangular_solve_kernel.h" namespace paddle { namespace operators { @@ -555,6 +556,11 @@ class LUGradKernel : public framework::OpKernel { framework::Tensor Pmat; Unpack_Pivot(dev_ctx, *P, &Pmat, m, k); + + using Context = + typename framework::ConvertToPhiContext::TYPE; + auto& phi_dev_ctx = static_cast(dev_ctx); + if (m <= n) { if (k < n) { framework::Tensor U_complement, U_grad_complement, phi_complement, @@ -605,8 +611,9 @@ class LUGradKernel : public framework::OpKernel { framework::Tensor psi_principal, phi_mH, psi_tmp; Tensor_Conj(dev_ctx, phi, &phi_mH); phi_mH = helper.Transpose(phi_mH); - triangular_solve(dev_ctx, U_narrow, phi_mH, - &psi_principal, true, false, false); + + phi::TriangularSolveKernel( + phi_dev_ctx, U_narrow, phi_mH, true, false, false, &psi_principal); 
Tensor_Conj(dev_ctx, psi_principal, &psi_principal); psi_principal = helper.Transpose(psi_principal); @@ -620,8 +627,9 @@ class LUGradKernel : public framework::OpKernel { SetValueCompute_dispatch(ctx, &psi, &psi_principal, &psi, axes, &slice_starts, &slice_ends, valuedims, xrank); - triangular_solve(dev_ctx, L_narrow_mH, psi, &psi_tmp, - true, false, true); + + phi::TriangularSolveKernel(phi_dev_ctx, L_narrow_mH, psi, + true, false, true, &psi_tmp); auto mat_dim_p = phi::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false); @@ -672,8 +680,10 @@ class LUGradKernel : public framework::OpKernel { &psi, axes, &slice_starts, &slice_ends, valuedims, xrank); framework::Tensor psi_principal, phi_mH, psi_tmp, U_narrow_mH; - triangular_solve(dev_ctx, L_narrow_mH, phi, - &psi_principal, true, false, true); + + phi::TriangularSolveKernel(phi_dev_ctx, L_narrow_mH, phi, + true, false, true, &psi_principal); + slice_starts[0] = 0; slice_starts[1] = 0; slice_ends[0] = k; @@ -695,8 +705,8 @@ class LUGradKernel : public framework::OpKernel { psi_tmp = helper.Transpose(psi_tmp); Tensor_Conj(dev_ctx, U_narrow, &U_narrow_mH); - triangular_solve(dev_ctx, U_narrow_mH, psi_tmp, &psi, - true, false, false); + phi::TriangularSolveKernel(phi_dev_ctx, U_narrow_mH, psi_tmp, + true, false, false, &psi); *dx = helper.Transpose(psi); } } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index ba047355ad7e0e7991e841cecd79e7e0b03c5911..af1069cb867993160d7346779d7de8161e37438c 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -5,6 +5,8 @@ endif() # please add new math_library in alphabetical order if (WITH_ASCEND_CL) math_library(concat_and_split DEPS concat_and_split_functor npu_op_runner) +elseif (WITH_MLU) +math_library(concat_and_split DEPS concat_and_split_functor mlu_baseop) else() math_library(concat_and_split DEPS concat_and_split_functor) endif() @@ -18,7 +20,6 @@ 
math_library(sampler DEPS generator) # math_library(math_function DEPS blas dense_tensor tensor) math_library(maxouting) -math_library(pooling) if(WITH_MKLDNN) math_library(selected_rows_functor DEPS selected_rows_utils math_function blas mkldnn_axpy_handler) @@ -44,8 +45,6 @@ math_library(vol2col) math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) -math_library(matrix_inverse) -math_library(segment_pooling) math_library(matrix_solve) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) @@ -70,7 +69,6 @@ if(WITH_GPU AND (NOT WITH_ROCM)) endif() endif() -cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) if(WITH_TESTING AND TEST im2col_test) set_tests_properties(im2col_test PROPERTIES TIMEOUT 120) endif() diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 46126ac59c892787d2f63956983404843e518ae7..c9308d27c0a3490d9c0094f45a1a9c2d894bbf57 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -18,6 +18,9 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#endif #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" @@ -226,6 +229,90 @@ class SplitFunctor { }; #endif +#ifdef PADDLE_WITH_MLU +template +class ConcatFunctor { + public: + void operator()(const platform::MLUDeviceContext& context, + const std::vector& input, int axis, + framework::Tensor* output) { + int dev_id = context.GetPlace().GetDeviceId(); + platform::MLUDeviceGuard guard(dev_id); + + auto ins_size = input.size(); + + const int axis_t = axis; + const int ins_size_t = ins_size; + auto place = context.GetPlace(); + output->mutable_data(place); + + // mlu should do sth + // init ins tensors + std::vector inputs; + std::vector input_descs; + std::vector desc_vector; + for (size_t i = 0; i < ins_size; i++) { + input_descs.emplace_back(MLUCnnlTensorDesc( + input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype()))); + desc_vector.push_back(input_descs.back().get()); + inputs.push_back(input[i].data()); + } + // init out tensors + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + // MLU should do sth + MLUCnnl::Concat(context, ins_size_t, axis_t, desc_vector.data(), + inputs.data(), output_desc.get(), GetBasePtr(output)); + } +}; + +template +class SplitFunctor { + public: + void operator()(const platform::MLUDeviceContext& context, + const framework::Tensor& input, + const std::vector& ref_inputs, + const int axis, std::vector* outputs) { + if (input.numel() == 0) { + return; + } + + int dev_id = context.GetPlace().GetDeviceId(); + platform::MLUDeviceGuard guard(dev_id); + + auto in_dims = input.dims(); + auto out_size = outputs->size(); + + std::vector outs_dims(out_size, in_dims); + for (size_t i = 0; i < out_size; ++i) { + outs_dims[i][axis] = ref_inputs[i]->dims()[axis]; + } + + // init out tensors + 
std::vector vct_tensor; + std::vector output_descs; + std::vector desc_vector; + for (size_t i = 0; i < out_size; i++) { + (*outputs)[i]->Resize(outs_dims[i]); + (*outputs)[i]->mutable_data(context.GetPlace()); + output_descs.emplace_back( + MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY, + ToCnnlDataType((*outputs)[i]->dtype()))); + desc_vector.push_back(output_descs.back().get()); + vct_tensor.push_back(GetBasePtr((*outputs)[i])); + } + // init in tensors + MLUCnnlTensorDesc input_desc(input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input.dtype())); + + // MLU should do sth + MLUCnnl::Split(context, out_size, axis, input_desc.get(), input.data(), + desc_vector.data(), vct_tensor.data()); + } +}; +#endif + #define DEFINE_FUNCTOR(type) \ template class ConcatFunctor; \ template class SplitFunctor; @@ -248,6 +335,19 @@ DEFINE_XPU_FUNCTOR(float) FOR_ALL_TYPES(DEFINE_NPU_FUNCTOR) #endif +#ifdef PADDLE_WITH_MLU +#define DEFINE_MLU_FUNCTOR(type) \ + template class ConcatFunctor; \ + template class SplitFunctor; +DEFINE_MLU_FUNCTOR(float) +DEFINE_MLU_FUNCTOR(platform::float16) +DEFINE_MLU_FUNCTOR(int64_t) +DEFINE_MLU_FUNCTOR(bool) +DEFINE_MLU_FUNCTOR(int) +DEFINE_MLU_FUNCTOR(int8_t) +DEFINE_MLU_FUNCTOR(int16_t) +DEFINE_MLU_FUNCTOR(uint8_t) +#endif } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h deleted file mode 100644 index e41f0aedf39ef582b4533b1eeb6ccda1e8ed7e49..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -using DataLayout = framework::DataLayout; - -/* - * \brief Compute the depthwise convolution which include - * forward process and backpropagation process - */ -template -class DepthwiseConvFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& filter, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, framework::Tensor* output, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -template -class DepthwiseConvInputGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& filter, - const framework::Tensor& output_grad, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* input_grad, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -template -class DepthwiseConvFilterGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output_grad, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - framework::Tensor* filter_grad, - const DataLayout data_layout = DataLayout::kNCHW); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git 
a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 9994ccc10cb13b2f692b18f16182f6bcdad7efa5..b77e23450360c836ae3efe0a6dc2c77216e660f0 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -34,10 +34,10 @@ namespace paddle { namespace operators { namespace math { -template +template static void CubInclusiveScan(InputIterator x_iter, OutputIterator y_iter, - size_t n, BinaryOp op, - const platform::CUDADeviceContext &dev_ctx) { + size_t n, BinaryOp op, const Context &dev_ctx) { memory::AllocationPtr allocation; void *temp_storage = nullptr; size_t temp_storage_bytes = 0; @@ -185,11 +185,10 @@ static __global__ void InclusiveScanInnerDimCUDAKernel(const T *x, T *y, } } -template +template static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, size_t inner_dim, T init, BinaryOp op, - bool reverse, - const platform::CUDADeviceContext &dev_ctx) { + bool reverse, const Context &dev_ctx) { constexpr size_t kThreadNumX = 16; constexpr size_t kThreadNumY = 32; @@ -209,10 +208,10 @@ static void InclusiveScanInnerDim(const T *x, T *y, size_t outer_dim, } } -template +template void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim, size_t inner_dim, T init, BinaryOp op, bool reverse, - const platform::CUDADeviceContext &dev_ctx) { + const Context &dev_ctx) { if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return; if (outer_dim == 1 && inner_dim == 1) { @@ -224,8 +223,7 @@ void InclusiveScan(const T *x, T *y, size_t outer_dim, size_t mid_dim, CubInclusiveScan(x, y, mid_dim, op, dev_ctx); } } else if (inner_dim != 1) { - platform::ForRange for_range( - dev_ctx, outer_dim * inner_dim); + platform::ForRange for_range(dev_ctx, outer_dim * inner_dim); if (reverse) { for_range( InclusiveScanOuterOrMidDimFunctor( diff --git a/paddle/fluid/operators/math/matrix_inverse.cc b/paddle/fluid/operators/math/matrix_inverse.cc deleted file mode 100644 
index 1b36e615c68df814015a2c308ed74b755f6bc635..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cc +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "Eigen/Core" -#include "Eigen/LU" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { - compute_inverse_eigen(context, a, a_inv); - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc deleted file mode 100644 index 41335a69417a94a567119bb8f37378af957be541..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace platform { -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor; - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { -#ifndef PADDLE_WITH_HIP - const auto& mat_dims = a.dims(); - const int rank = mat_dims.size(); - int n = mat_dims[rank - 1]; - int batch_size = rank > 2 ? a.numel() / (n * n) : 1; - - memory::allocation::AllocationPtr tmp_gpu_mat_data; - const T* gpu_mat = a.data(); - if (n >= 32) { - // Copy all elements of input matrix A to a temporary memory space to - // avoid being overriden by getrf. - tmp_gpu_mat_data = memory::Alloc(context, a.numel() * sizeof(T)); - memory::Copy(context.GetPlace(), tmp_gpu_mat_data->ptr(), - context.GetPlace(), a.data(), a.numel() * sizeof(T), - context.stream()); - gpu_mat = reinterpret_cast(tmp_gpu_mat_data->ptr()); - } - - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = gpu_mat + i * n * n; - cpu_ptrs[i + batch_size] = a_inv->data() + i * n * n; - } - - // Copy the addresses of A and A_inv from host to device. 
- memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - T** gpu_inv_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - - // Allocate device memory for info and pivots. - int num_ints = n < 32 ? batch_size : batch_size * (n + 1); - memory::allocation::AllocationPtr tmp_gpu_info_data = - memory::Alloc(context, num_ints * sizeof(int)); - int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); - - auto blas = phi::funcs::GetBlas(context); - - std::vector info; // only for singular checking - info.resize(batch_size); - // This functions in cuBLAS is intended to be used for matrices of small - // sizes where the launch overhead is a significant factor. - // TODO(Xreki): call function in cusolver for large matrices. - if (n < 32) { - // cublasmatinvBatched is a short cut of cublasgetrfBatched - // plus cublasgetriBatched. - // However it only works if N is less than 32. If not, we need to - // go through cublasgetrfBatched and cublasgetriBatched. - blas.BatchedMatInv(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_inv_ptrs, gpu_info_ptr, batch_size); - } else { - // This function performs the LU factorization of each matrix A by the - // equation P * A = L * U. L and U are written back to original matrix A, - // and diagonal elements of L are discarded. 
- int* gpu_pivot_ptr = - reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; - blas.BatchedGETRF(n, reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_info_ptr, batch_size); - - blas.BatchedGETRI(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_inv_ptrs, gpu_info_ptr, batch_size); - } - memory::Copy(platform::CPUPlace(), info.data(), context.GetPlace(), - gpu_info_ptr, sizeof(int) * batch_size, context.stream()); - for (int i = 0; i < batch_size; ++i) { - PADDLE_ENFORCE_EQ(info[i], 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: U(%d, %d) is zero, singular U. " - "Please check the matrix value and change it to a " - "non-singular matrix", - i, info[i], info[i])); - } -#else - compute_inverse_eigen(context, a, a_inv); -#endif - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.cc b/paddle/fluid/operators/math/matrix_solve.cc index 883ee9b148654f8621b26942739730426ba7fc7d..7b239b8166644697581d0051f12b6abacc6832fa 100644 --- a/paddle/fluid/operators/math/matrix_solve.cc +++ b/paddle/fluid/operators/math/matrix_solve.cc @@ -34,45 +34,6 @@ class MatrixSolveFunctor { template class MatrixSolveFunctor; template class MatrixSolveFunctor; -template -class TriangularSolveFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor* a, framework::Tensor* b, bool left, - bool upper, bool transpose, bool unitriangular) { - CBLAS_SIDE side = left ? CblasLeft : CblasRight; - CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower; - CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans; - CBLAS_DIAG diag = unitriangular ? 
CblasUnit : CblasNonUnit; - - const T* a_data = a->data(); - T* b_data = b->mutable_data(context.GetPlace()); - - int a_dim_size = a->dims().size(); - int b_dim_size = b->dims().size(); - - int M = static_cast(b->dims()[b_dim_size - 2]); - int N = static_cast(b->dims()[b_dim_size - 1]); - auto lda = left ? std::max(1, M) : std::max(1, N); - auto ldb = std::max(1, N); - - int batch_size = 1; - auto& a_dim = a->dims(); - for (int i = 0; i < a_dim_size - 2; i++) { - batch_size *= a_dim[i]; - } - - auto blas = phi::funcs::GetBlas(context); - for (int i = 0; i < batch_size; i++) { - blas.TRSM(side, uplo, transA, diag, M, N, T(1), a_data + i * M * M, lda, - b_data + i * N * M, ldb); - } - } -}; - -template class TriangularSolveFunctor; -template class TriangularSolveFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc index d3490ead212731f3fc6a75d61a31c11c72c9129d..737196dde1dfc26269fe083fe17037c829ef8109 100644 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -161,67 +161,6 @@ class MatrixSolveFunctor { template class MatrixSolveFunctor; template class MatrixSolveFunctor; -template -class TriangularSolveFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, const Tensor* a, - Tensor* b, bool left, bool upper, bool transpose, - bool unitriangular) { - CBLAS_SIDE side = left ? CblasLeft : CblasRight; - CBLAS_UPLO uplo = upper ? CblasUpper : CblasLower; - CBLAS_TRANSPOSE transA = transpose ? CblasTrans : CblasNoTrans; - CBLAS_DIAG diag = unitriangular ? CblasUnit : CblasNonUnit; - - const T* a_data = a->data(); - T* b_data = b->mutable_data(context.GetPlace()); - - int a_dim_size = a->dims().size(); - int b_dim_size = b->dims().size(); - - int M = static_cast(b->dims()[b_dim_size - 2]); - int N = static_cast(b->dims()[b_dim_size - 1]); - auto lda = left ? 
std::max(1, M) : std::max(1, N); - auto ldb = std::max(1, N); - - int batch_size = 1; - auto& a_dim = a->dims(); - for (int i = 0; i < a_dim_size - 2; i++) { - batch_size *= a_dim[i]; - } - - auto blas = phi::funcs::GetBlas(context); - if (batch_size <= 8 && M >= 64) { - for (auto i = 0; i < batch_size; i++) { - blas.TRSM(side, uplo, transA, diag, M, N, static_cast(1.0), - a_data + i * M * M, lda, b_data + i * N * M, ldb); - } - } else { - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = a_data + i * M * M; - cpu_ptrs[i + batch_size] = b_data + i * M * N; - } - - // Copy the addresses of A and tmp_b from host to device. - memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - - const T** gpu_a_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()); - T** gpu_b_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - blas.BatchedTRSM(side, uplo, transA, diag, M, N, static_cast(1.0), - gpu_a_ptrs, lda, gpu_b_ptrs, ldb, batch_size); - } - } -}; - -template class TriangularSolveFunctor; -template class TriangularSolveFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h index 1dc43205592f69cc105b43fe49b2f7872f8251c3..415d0c6dd8e0cf51958783c32aa49c66cce9e15c 100644 --- a/paddle/fluid/operators/math/matrix_solve.h +++ b/paddle/fluid/operators/math/matrix_solve.h @@ -117,14 +117,6 @@ class MatrixSolveFunctor { const framework::Tensor& b, framework::Tensor* out); }; -template -class TriangularSolveFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor* a, - framework::Tensor* b, bool left, bool upper, bool transpose, - bool unitriangular); 
-}; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc index 45556e97d1d7afb81d626c99b078cbc215c0195f..28ec3a871022f4b9ec4dce9d9310fd630f10e473 100644 --- a/paddle/fluid/operators/math/maxouting.cc +++ b/paddle/fluid/operators/math/maxouting.cc @@ -14,106 +14,107 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace operators { namespace math { // All tensors are in NCHW or NHWC format, and the groups must be greater than 1 -template -class MaxOutFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - int fea_size = input_height * input_width; - // c_size means the output size of each sample - int c_size = fea_size * output_channels; - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int new_bindex = c_size * i; - for (int c = 0; c < output_channels; ++c) { - int new_cindex = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - T ele = static_cast(-FLT_MAX); - int input_idx, output_idx; - for (int ph = 0; ph < groups; ++ph) { - if (axis == 1) { - input_idx = - (new_bindex + new_cindex) * groups + ph * fea_size + f; - } else { - input_idx = (new_bindex + f * output_channels + c) * groups + ph; - } - T x = input_data[input_idx]; - ele = ele > x ? 
ele : x; - } +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + int fea_size = input_height * input_width; + // c_size means the output size of each sample + int c_size = fea_size * output_channels; + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int i = 0; i < batch_size; ++i) { + int new_bindex = c_size * i; + for (int c = 0; c < output_channels; ++c) { + int new_cindex = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + T ele = static_cast(-FLT_MAX); + int input_idx, output_idx; + for (int ph = 0; ph < groups; ++ph) { if (axis == 1) { - output_idx = new_bindex + new_cindex + f; + input_idx = (new_bindex + new_cindex) * groups + ph * fea_size + f; } else { - output_idx = new_bindex + f * output_channels + c; + input_idx = (new_bindex + f * output_channels + c) * groups + ph; } - output_data[output_idx] = ele; + T x = input_data[input_idx]; + ele = ele > x ? ele : x; } + if (axis == 1) { + output_idx = new_bindex + new_cindex + f; + } else { + output_idx = new_bindex + f * output_channels + c; + } + output_data[output_idx] = ele; } } } -}; +} -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - int fea_size = input_height * input_width; - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + int fea_size = input_height * input_width; + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int blen = fea_size * output_channels * i; - for (int c = 0; c < output_channels; ++c) { - int clen = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - int input_idx0, output_idx; - bool continue_match = true; - if (axis == 1) { - input_idx0 = (blen + clen) * groups + f; - output_idx = blen + clen + f; - } else { - input_idx0 = (blen + f * output_channels + c) * groups; - output_idx = blen + f * output_channels + c; - } - for (int g = 0; g < groups && continue_match; ++g) { - int idx_offset = (axis == 1 ? 
fea_size * g : g); - int input_idx = input_idx0 + idx_offset; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - continue_match = false; - } + for (int i = 0; i < batch_size; ++i) { + int blen = fea_size * output_channels * i; + for (int c = 0; c < output_channels; ++c) { + int clen = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + int input_idx0, output_idx; + bool continue_match = true; + if (axis == 1) { + input_idx0 = (blen + clen) * groups + f; + output_idx = blen + clen + f; + } else { + input_idx0 = (blen + f * output_channels + c) * groups; + output_idx = blen + f * output_channels + c; + } + for (int g = 0; g < groups && continue_match; ++g) { + int idx_offset = (axis == 1 ? fea_size * g : g); + int input_idx = input_idx0 + idx_offset; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; } } } } } -}; +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index 1856fb4eb48c73f96d4f6428ba890c821a61292c..1d0478db5ef4a80d955d1166ffa21ff39f6bd184 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/maxouting.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -95,61 +96,57 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = output->numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxOut<<>>( - nthreads, input_data, input_channels, input_height, input_width, groups, - axis, output_data); - } -}; +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int nthreads = output->numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + axis, output_data); +} + /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = output.numel(); - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelMaxoutGrad<<>>( - nthreads, input_data, output_data, output_grad_data, input_grad_data, - input_channels, input_height, input_width, groups, axis); - } -}; +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_channels = input.dims()[axis]; + const int input_height = (axis == 1 ? 
input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int nthreads = output.numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups, axis); +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; @@ -157,6 +154,12 @@ template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; + +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h index 0d8372df8a2fec306f6091712c66d55d1e71216e..1f4964f7715426d2eab6168ae009ffbd40e1ff0a 100644 --- a/paddle/fluid/operators/math/maxouting.h +++ b/paddle/fluid/operators/math/maxouting.h @@ -30,7 +30,7 @@ class MaxOutFunctor { const int axis = 1); }; -template +template class MaxOutGradFunctor { public: void operator()(const DeviceContext& context, const framework::Tensor& input, diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h deleted file mode 100644 index dfd3dad38644b65ef0b5e62e1b54ce210e9c489a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/pooling.h +++ /dev/null @@ -1,315 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -namespace math { - -/* - * \brief Extracting simple operations from pooling. - * Both MaxPool and AvgPool need "initial", "compute" and "finalize" - * operation. - * MaxPool initializes temp variable to the negative maximum to find the - * maximum value in the pooling field. - * AvgPool initializes temp variable to the zero to accumulate all values - * in pool pooling, and finally takes the average. - * MaxPoolGrad and AvgPoolGrad are gradient operations respectively. - */ -template -class MaxPool { - public: - DEVICE inline T initial() { return static_cast(-FLT_MAX); } - DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? 
*y : x; } - DEVICE inline void finalize(const T& pool_field, T* y) {} -}; - -template -class AvgPool { - using MT = typename details::MPTypeTrait::Type; - MT intermediate_res; - - public: - DEVICE inline T initial() { - intermediate_res = static_cast(0.0f); - return static_cast(0); - } - - DEVICE inline void compute(const T& x, T* y) { - intermediate_res += static_cast(x); - } - - DEVICE inline void finalize(const T& pool_field, T* y) { - *y = static_cast(intermediate_res / (static_cast(pool_field))); - } -}; - -template -class MaxPoolGrad { - public: - static constexpr bool use_x = true; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += dy * static_cast(x == y); - } -}; - -template -class AvgPoolGrad { - public: - static constexpr bool use_x = false; - HOSTDEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, - T* dx) { - *dx += (scale * dy); - } -}; - -/* used for adaptive pool to calculate start and end index of each divided grid - */ -HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { - return static_cast( - floor(static_cast(ph * input_size) / output_size)); -} - -HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { - return static_cast( - ceil(static_cast((ph + 1) * input_size) / output_size)); -} - -/* - * \brief Getting pooling results, and calculating gradient. - * - * In pool2d, all Tensors are in NCHW or NHWC format. Where N is batch size, C - * is the number of channels, H and W is the height and width of feature. - * In pool3d, all Tensors are in NCDHW or NDHWC format. Where N is batch size, C - * is the number of channels, D, H and W is the depth, height and width of - * feature. - * - * In max pooling, it is possible that the pooling region has multiple maximum - * elements. In this case, we should compute the gradient of the first maximum - * element. - * This is different from average pooling. 
So we rewrite the max_pool_grad: - * MaxPool2dGradFunctor, MaxPool3dGradFunctor. - */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool2dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool2dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_compute); -}; - -template -class Pool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_compute); -}; - -template -class 
MaxPool2dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* input_grad); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad); -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -class Pool3dDirectCUDAFunctor { - public: - void operator()(const T* input, const std::vector& input_shape, - const std::vector& output_shape, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, T* output, gpuStream_t stream, - PoolProcess pool_compute); -}; -#endif - -template -class Pool3dFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* output, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* output, PoolProcess pool_compute); -}; - -template -class Pool3dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& 
strides, - const std::vector& paddings, bool exclusive, - bool adaptive, framework::Tensor* input_grad, - PoolProcess pool_compute); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, bool exclusive, bool adaptive, - framework::Tensor* input_grad, PoolProcess pool_compute); -}; - -template -class MaxPool3dGradFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* input_grad); - // overload operator() to support argument data_format - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& output, - const framework::Tensor& output_grad, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, - const std::string data_format, framework::Tensor* input_grad); -}; - -/* - * \brief Getting max pooling results and corresponding max index, and - * calculating gradient. - * In up-sampling-pooling, it is necessary to know max element index. - * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in - * NCDHW format. 
- */ -template -class MaxPool2dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool2dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -template -class MaxPool3dWithIndexFunctor { - public: - void operator()(const DeviceContext& context, const framework::Tensor& input, - const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* output, framework::Tensor* mask); -}; - -template -class MaxPool3dWithIndexGradFunctor { - public: - void operator()(const DeviceContext& context, - const framework::Tensor& output_grad, - const framework::Tensor& mask, const std::vector& ksize, - const std::vector& strides, - const std::vector& paddings, bool adaptive, - framework::Tensor* input_grad); -}; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 42bf1f471deb5238fdb34dcd9284972930305f58..bc5a589ed6fb137c5013253a65971dcf80d4ac72 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/vol2col.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace platform { class CPUDeviceContext; @@ -141,6 +143,116 @@ class Vol2ColFunctor { } }; +template +class Vol2ColFunctor { + public: + void operator()(const phi::CPUContext& context, const framework::Tensor& vol, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* col, + const DataLayout data_layout) const { + PADDLE_ENFORCE_EQ(vol.dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol.dims().size())); + + PADDLE_ENFORCE_EQ(col->dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col->dims().size())); + + int input_channels = + (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); + int input_depth = + (data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]); + int input_height = + (data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]); + int input_width = + (data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]); + int filter_depth = col->dims()[1]; + int filter_height = col->dims()[2]; + int filter_width = col->dims()[3]; + int output_depth = col->dims()[4]; + int output_height = col->dims()[5]; + int output_width = col->dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + // changed + bool paddings_size_is_6 = (paddings.size() == 6); + int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; + int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; + int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; + int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; + int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; + + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); + const T* vol_data = vol.data(); + T* col_data = col->data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int c_in = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + w; + int vol_idx; + if (data_layout != DataLayout::kNHWC) { + vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + } else { + vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) * + input_channels + + c_in; + } 
+ col_data[col_idx] = + (h_pad < 0 || h_pad >= input_height || w_pad < 0 || + w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) + ? static_cast(0) + : vol_data[vol_idx]; + } + } + } + } + } +}; + /* * vol = [input_channels,input_depth, input_height, input_width] * col = @@ -258,10 +370,125 @@ class Col2VolFunctor { } }; +template +class Col2VolFunctor { + public: + void operator()(const phi::CPUContext& context, const framework::Tensor& col, + const std::vector& dilations, + const std::vector& strides, + const std::vector& paddings, framework::Tensor* vol, + const DataLayout data_layout) const { + PADDLE_ENFORCE_EQ(vol->dims().size(), 4, + platform::errors::InvalidArgument( + "The dimension of vol should be 4, but received %d.", + vol->dims().size())); + + PADDLE_ENFORCE_EQ(col.dims().size(), 7, + platform::errors::InvalidArgument( + "The dimension of col should be 7, but received %d.", + col.dims().size())); + + int input_channels = + (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); + int input_depth = + (data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]); + int input_height = + (data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]); + int input_width = + (data_layout != DataLayout::kNHWC ? vol->dims()[3] : vol->dims()[2]); + int filter_depth = col.dims()[1]; + int filter_height = col.dims()[2]; + int filter_width = col.dims()[3]; + int output_depth = col.dims()[4]; + int output_height = col.dims()[5]; + int output_width = col.dims()[6]; + int channels_col = + input_channels * filter_depth * filter_height * filter_width; + + bool paddings_size_is_6 = (paddings.size() == 6); + int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; + int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; + int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; + int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; + int pad_w_left = paddings_size_is_6 ? 
paddings[4] : paddings[2]; + int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; + + auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1; + PADDLE_ENFORCE_EQ( + input_depth_tmp, output_depth, + platform::errors::InvalidArgument( + "input_depth(%d) and output_depth(%d) are mismatching.", + input_depth_tmp, output_depth)); + auto input_height_tmp = (input_height + pad_h_up + pad_h_down - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1; + PADDLE_ENFORCE_EQ( + input_height_tmp, output_height, + platform::errors::InvalidArgument( + "input_height(%d) and output_height(%d) are mismatching.", + input_height_tmp, output_height)); + auto input_width_tmp = (input_width + pad_w_left + pad_w_right - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1; + PADDLE_ENFORCE_EQ( + input_width_tmp, output_width, + platform::errors::InvalidArgument( + "input_width(%d) and output_width(%d) are mismatching.", + input_width_tmp, output_width)); + T* vol_data = vol->data(); + const T* col_data = col.data(); + + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int d_offset = (c / filter_width / filter_height) % filter_depth; + int cIm = c / filter_width / filter_height / filter_depth; + for (int d = 0; d < output_depth; ++d) { + int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; + for (int h = 0; h < output_height; ++h) { + int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; + for (int w = 0; w < output_width; ++w) { + int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; + + if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && + w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { + int vol_idx; + if (data_layout != DataLayout::kNHWC) { + vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) * + input_width + + w_pad; + } 
else { + vol_idx = + ((d_pad * input_height + h_pad) * input_width + w_pad) * + input_channels + + cIm; + } + int col_idx = + ((c * output_depth + d) * output_height + h) * output_width + + w; + vol_data[vol_idx] += col_data[col_idx]; + } + } + } + } + } + } +}; + template class Vol2ColFunctor; template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; + template class Col2VolFunctor; template class Col2VolFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 788dbb2204109dd4f215730e4234e3fec8aef702..01fa01e3c6ed04c151f709dd5fbebe387c32bde3 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -524,8 +524,8 @@ REGISTER_OPERATOR(matmul_v2, ops::MatMulV2Op, ops::MatMulV2OpMaker, ops::MatMulV2GradOpMaker, ops::MatMulV2GradOpMaker); -DELCARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, - PT_INFER_META(phi::GeneralBinaryGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(matmul_v2_grad, MatMulV2GradInferShapeFunctor, + PD_INFER_META(phi::GeneralBinaryGradInferMeta)); REGISTER_OPERATOR(matmul_v2_grad, ops::MatMulV2OpGrad, ops::MatMulV2OpDoubleGradMaker, ops::MatMulV2OpDoubleGradMaker, diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc index 1524a50f1ac6d6afa67722bc5d1c16a581395bb2..87df75ac465042a0f7894abecb4be4c213e5d807 100644 --- a/paddle/fluid/operators/matmul_v2_op_xpu.cc +++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc @@ -38,7 +38,7 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( ColumnMatrixFromVector(y_dims), 0, trans_y); - if (x_dims.size() == 3 && y_dims.size() <= 2) { + if (x_dims.size() >= 3 && y_dims.size() <= 2) { // if transpose_X is true, the transpose cost much 
time if (!trans_x) { mat_dim_a.height_ *= mat_dim_a.batch_size_; diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc index c65af3129f3646163925be95b27b9fec25207f8c..cdf204628b638f877c92e35a8941487aa39b5427 100644 --- a/paddle/fluid/operators/matrix_power_op.cc +++ b/paddle/fluid/operators/matrix_power_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/matrix_power_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" namespace paddle { namespace operators { @@ -119,13 +122,3 @@ REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker, ops::MatrixPowerGradOpMaker); REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp); - -REGISTER_OP_CPU_KERNEL( - matrix_power, - ops::MatrixPowerKernel, - ops::MatrixPowerKernel); - -REGISTER_OP_CPU_KERNEL( - matrix_power_grad, - ops::MatrixPowerGradKernel, - ops::MatrixPowerGradKernel); diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h deleted file mode 100644 index d2c67d80b4f5a562d47e56173ecf1ea2f99bff56..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/matrix_power_op.h +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct IdentityMatrixFunctor { - IdentityMatrixFunctor(const int m, T* output) : m_(m), output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int row = index / m_ % m_; - const int col = index % m_; - output_[index] = col == row ? static_cast(1) : static_cast(0); - } - - const int m_; - T* output_; -}; - -template -void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, - const paddle::framework::ExecutionContext& ctx) { - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - T* out_data = Out->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, X->numel()); - - if (n == 0) { - // Out = Identity Matrix - IdentityMatrixFunctor functor(x_dims[x_ndim - 1], out_data); - for_range(functor); - return; - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - if (new_n == 1) { - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, Out); - return; - } - - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (new_n == 2) { - // Out = newX * newX - Out->mutable_data(ctx.GetPlace()); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 3) { - // Out = (newX * newX) * newX - // Note: 
C[i] matrices in MatMul must not overlap, i.e. the individual - // gemm operations must be computable independently; otherwise, - // undefined behavior is expected. - Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 4) { - // Out = (newX * newX) * (newX * newX) - Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, temp, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } - - // Calculate Out = newX^{n} for abs(n) > 4 with time complexity as O(logN) - int bit = 0; - Tensor z = Tensor(X->dtype()); - bool out_inited = false; - Tensor temp_out = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor temp_z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - while (new_n > 0) { - bit = new_n & 0x1; - new_n >>= 1; - if (z.IsInitialized()) { - blas.MatMul(z, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_z, static_cast(0)); - framework::TensorCopy(temp_z, ctx.GetPlace(), dev_ctx, &z); - } else { - z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, &z); - } - if (bit == 1) { - if (out_inited == true) { - blas.MatMul(*Out, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_out, static_cast(0)); - framework::TensorCopy(temp_out, ctx.GetPlace(), dev_ctx, Out); - } else { - framework::TensorCopy(z, ctx.GetPlace(), dev_ctx, Out); - out_inited = true; - } - } - } - return; -} - -template -class MatrixPowerKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - Tensor* Out = ctx.Output("Out"); - int n = ctx.Attr("n"); - - 
const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], x_dims[x_ndim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) should be equal." - "X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_ndim - 2], x_dims[x_ndim - 1])); - - MatrixPowerFunction(X, n, Out, ctx); - } -}; - -template -void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, - const Tensor* dOut, const int n, Tensor* dX, - const paddle::framework::ExecutionContext& ctx) { - dX->mutable_data(ctx.GetPlace()); - const auto& x_dims = X->dims(); - - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (n == 0) { - // \nabla X = O - phi::funcs::SetConstant zero; - zero(dev_ctx, dX, static_cast(0)); - return; - } else if (n == 1) { - // \nabla X = \nabla Out - framework::TensorCopy(*dOut, ctx.GetPlace(), dev_ctx, dX); - return; - } - - auto trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, true); - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (n == -1) { - // \nabla X = Out^{T} * \nabla Out * Out^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*Out, trans_desc, *dOut, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, *Out, trans_desc, static_cast(1), dX, - static_cast(0)); - return; - } - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - // Use chain rule blow to compute \nabla newX^{n} - // First, Get newX^{0}, newX^{1}, ..., newX^{n - 1}, - // Note that newX^{0} can be omitted - std::vector> tensor_list(new_n - 1); - tensor_list[0] = std::make_shared(new_x); - int index = 1; - while (index < 
new_n - 1) { - tensor_list[index] = std::make_shared( - ctx.AllocateTmpTensor(X->dims(), dev_ctx)); - blas.MatMul(*tensor_list[index - 1], no_trans_desc, new_x, no_trans_desc, - static_cast(1), tensor_list[index].get(), static_cast(0)); - index++; - } - - // Second, \nabla newX = \sum_{i = 0}^{n - 1} (newX^{T}^{i} - // * \nabla Out - // * (newX^{T}^{n - i - 1}) - Tensor dx_new = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[new_n - 2], trans_desc, *dOut, no_trans_desc, - static_cast(1), &dx_new, static_cast(0)); - Tensor da_an_minus1 = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*dOut, no_trans_desc, *tensor_list[new_n - 2], trans_desc, - static_cast(1), &da_an_minus1, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), da_an_minus1.data(), - dx_new.data()); - int start = 0; - while (start < new_n - 2) { - Tensor a_da = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor a_da_a = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[start], trans_desc, *dOut, no_trans_desc, - static_cast(1), &a_da, static_cast(0)); - blas.MatMul(a_da, no_trans_desc, *tensor_list[new_n - 3 - start], - trans_desc, static_cast(1), &a_da_a, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), a_da_a.data(), - dx_new.data()); - start++; - } - - if (n > 0) { - // \nabla X = \nabla newX - framework::TensorCopy(dx_new, ctx.GetPlace(), dev_ctx, dX); - } else { - // \nabla X = newX^{T} * \nabla newX * newX^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, trans_desc, dx_new, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, new_x, trans_desc, static_cast(1), - dX, static_cast(0)); - } - return; -} - -template -class MatrixPowerGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - const Tensor* Out = ctx.Input("Out"); - const Tensor* 
dOut = ctx.Input(framework::GradVarName("Out")); - const int n = ctx.Attr("n"); - Tensor* dX = ctx.Output(framework::GradVarName("X")); - - MatrixPowerGradFunction(X, Out, dOut, n, dX, ctx); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/matrix_rank_op.cc b/paddle/fluid/operators/matrix_rank_op.cc index 65599259e2237387ad0dd85b5a9772733e3d7a1a..e7d08b6597360bb0431add6ae63eb99f401c8ce0 100644 --- a/paddle/fluid/operators/matrix_rank_op.cc +++ b/paddle/fluid/operators/matrix_rank_op.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/matrix_rank_op.h" #include #include #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/svd_helper.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -69,9 +69,9 @@ class MatrixRankeOp : public framework::OperatorWithKernel { std::vector x_batch_dims_array(max_dim); std::vector tol_dims_array(max_dim); std::vector out_dims_array(max_dim); - GetBroadcastDimsArrays(dim_x_batch, dim_tol, x_batch_dims_array.data(), - tol_dims_array.data(), out_dims_array.data(), - max_dim, axis); + phi::funcs::GetBroadcastDimsArrays( + dim_x_batch, dim_tol, x_batch_dims_array.data(), + tol_dims_array.data(), out_dims_array.data(), max_dim, axis); ctx->SetOutputDim("Out", phi::make_ddim(out_dims_array)); } } else { @@ -114,141 +114,9 @@ class MatrixRankeOpMaker : public framework::OpProtoAndCheckerMaker { } }; -template -void BatchEigenvalues(const T* x_data, T* eigenvalues_data, int batches, - int rows, int cols, int k) { - // Eigen::Matrix API need non-const pointer. 
- T* input = const_cast(x_data); - int stride = rows * cols; - for (int i = 0; i < batches; i++) { - auto m = Eigen::Map< - Eigen::Matrix>( - input + i * stride, rows, rows); - Eigen::SelfAdjointEigenSolver< - Eigen::Matrix> - eigen_solver(m); - auto eigenvalues = eigen_solver.eigenvalues().cwiseAbs(); - for (int j = 0; j < k; j++) { - *(eigenvalues_data + i * k + j) = eigenvalues[j]; - } - } -} - -template -void BatchSVD(const T* x_data, T* eigenvalues_data, int batches, int rows, - int cols, int k) { - // Eigen::Matrix API need non-const pointer. - T* input = const_cast(x_data); - int stride = rows * cols; - Eigen::BDCSVD< - Eigen::Matrix> - svd; - for (int i = 0; i < batches; i++) { - auto m = Eigen::Map< - Eigen::Matrix>( - input + i * stride, rows, cols); - svd.compute(m); - auto res_s = svd.singularValues(); - for (int j = 0; j < k; j++) { - eigenvalues_data[i * k + j] = res_s[j]; - } - } -} - -template -class MatrixRankCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - auto* x_data = x->data(); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - bool hermitian = context.Attr("hermitian"); - - auto dim_x = x->dims(); - auto dim_out = out->dims(); - int rows = dim_x[dim_x.size() - 2]; - int cols = dim_x[dim_x.size() - 1]; - int k = std::min(rows, cols); - auto numel = x->numel(); - int batches = numel / (rows * cols); - - bool use_default_tol = context.Attr("use_default_tol"); - const Tensor* atol_tensor = nullptr; - Tensor temp_tensor; - T rtol_T = 0; - if (use_default_tol) { - framework::TensorFromVector(std::vector{0}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); - } else if (context.HasInput("TolTensor")) { - atol_tensor = context.Input("TolTensor"); - } else { - 
framework::TensorFromVector(std::vector{context.Attr("tol")}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - } - - Tensor eigenvalue_tensor; - auto* eigenvalue_data = eigenvalue_tensor.mutable_data( - detail::GetEigenvalueDim(dim_x, k), context.GetPlace()); - if (hermitian) { - BatchEigenvalues(x_data, eigenvalue_data, batches, rows, cols, k); - } else { - BatchSVD(x_data, eigenvalue_data, batches, rows, cols, k); - } - - auto dito_T = - math::DeviceIndependenceTensorOperations( - context); - std::vector max_eigenvalue_shape = - phi::vectorize(detail::RemoveLastDim(eigenvalue_tensor.dims())); - Tensor max_eigenvalue_tensor = - dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape); - - Tensor temp_rtol_tensor; - framework::TensorFromVector(std::vector{rtol_T}, &temp_rtol_tensor); - Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor); - Tensor tol_tensor; - tol_tensor.mutable_data(dim_out, context.GetPlace()); - ElementwiseComputeEx, platform::CPUDeviceContext, - T, T>(context, atol_tensor, &rtol_tensor, -1, - GreaterElementFunctor(), &tol_tensor); - - tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); - - Tensor compare_result; - compare_result.mutable_data(detail::NewAxisDim(dim_out, k), - context.GetPlace()); - - int axis = -1; - if (eigenvalue_tensor.dims().size() >= tol_tensor.dims().size()) { - ElementwiseComputeEx, - platform::CPUDeviceContext, T, int>( - context, &eigenvalue_tensor, &tol_tensor, axis, - GreaterThanFunctor(), &compare_result); - } else { - ElementwiseComputeEx, - platform::CPUDeviceContext, T, int>( - context, &eigenvalue_tensor, &tol_tensor, axis, - LessThanFunctor(), &compare_result); - } - auto dito_int = - math::DeviceIndependenceTensorOperations(context); - std::vector result_shape = phi::vectorize(dim_out); - Tensor result = dito_int.ReduceSum(compare_result, result_shape); - out->ShareDataWith(result); - } -}; - } // namespace operators } // namespace paddle namespace ops = 
paddle::operators; REGISTER_OPERATOR(matrix_rank, ops::MatrixRankeOp, ops::MatrixRankeOpMaker); - -REGISTER_OP_CPU_KERNEL(matrix_rank, ops::MatrixRankCPUKernel, - ops::MatrixRankCPUKernel); diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu deleted file mode 100644 index b1800c9c0c9f8b68f7f62de42943f7a425fa0ddb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ /dev/null @@ -1,315 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver -#include -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/matrix_rank_op.h" -#include "paddle/fluid/operators/svd_helper.h" -#include "paddle/fluid/platform/dynload/cusolver.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { -namespace detail { -DDim GetUDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); - x_vec[x_vec.size() - 1] = k; - return phi::make_ddim(x_vec); -} - -DDim GetVHDDim(const DDim& x_dim, int k) { - auto x_vec = phi::vectorize(x_dim); - x_vec[x_vec.size() - 2] = k; - return phi::make_ddim(x_vec); -} -} // namespace detail - -template -class MatrixRankGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); - - const Tensor* x = context.Input("X"); - auto* x_data = x->data(); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - bool hermitian = context.Attr("hermitian"); - - auto dim_x = x->dims(); - auto dim_out = out->dims(); - int rows = dim_x[dim_x.size() - 2]; - int cols = dim_x[dim_x.size() - 1]; - int k = std::min(rows, cols); - auto numel = x->numel(); - int batches = numel / (rows * cols); - - bool use_default_tol = context.Attr("use_default_tol"); - const Tensor* atol_tensor = nullptr; - Tensor temp_tensor; - T rtol_T = 0; - if (use_default_tol) { - framework::TensorFromVector(std::vector{0}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - rtol_T = std::numeric_limits::epsilon() * std::max(rows, cols); - } else if (context.HasInput("TolTensor")) { - atol_tensor = context.Input("TolTensor"); - } else { - 
framework::TensorFromVector(std::vector{context.Attr("tol")}, - context.device_context(), &temp_tensor); - atol_tensor = &temp_tensor; - } - - // Must Copy X once, because the gesvdj will destory the content when exit. - Tensor x_tmp; - paddle::framework::TensorCopy(*x, context.GetPlace(), &x_tmp); - auto info = memory::Alloc(dev_ctx, sizeof(int) * batches); - int* info_ptr = reinterpret_cast(info->ptr()); - - Tensor eigenvalue_tensor; - auto* eigenvalue_data = eigenvalue_tensor.mutable_data( - detail::GetEigenvalueDim(dim_x, k), context.GetPlace()); - if (hermitian) { - SyevjBatched(dev_ctx, batches, rows, x_tmp.data(), eigenvalue_data, - info_ptr); - platform::ForRange for_range( - dev_ctx, eigenvalue_tensor.numel()); - phi::funcs::AbsFunctor functor(eigenvalue_data, eigenvalue_data, - eigenvalue_tensor.numel()); - for_range(functor); - } else { - Tensor U, VH; - auto* u_data = - U.mutable_data(detail::GetUDDim(dim_x, k), context.GetPlace()); - auto* vh_data = - VH.mutable_data(detail::GetVHDDim(dim_x, k), context.GetPlace()); - GesvdjBatched(dev_ctx, batches, cols, rows, k, x_tmp.data(), vh_data, - u_data, eigenvalue_data, info_ptr, 1); - } - - auto dito_T = - math::DeviceIndependenceTensorOperations(context); - std::vector max_eigenvalue_shape = - phi::vectorize(detail::RemoveLastDim(eigenvalue_tensor.dims())); - Tensor max_eigenvalue_tensor = - dito_T.ReduceMax(eigenvalue_tensor, max_eigenvalue_shape); - Tensor temp_rtol_tensor; - framework::TensorFromVector(std::vector{rtol_T}, - context.device_context(), &temp_rtol_tensor); - Tensor rtol_tensor = dito_T.Mul(temp_rtol_tensor, max_eigenvalue_tensor); - Tensor tol_tensor; - tol_tensor.mutable_data(dim_out, context.GetPlace()); - ElementwiseComputeEx, platform::CUDADeviceContext, - T, T>(context, atol_tensor, &rtol_tensor, -1, - GreaterElementFunctor(), &tol_tensor); - - tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1)); - - Tensor compare_result; - 
compare_result.mutable_data(detail::NewAxisDim(dim_out, k), - context.GetPlace()); - int axis = -1; - ElementwiseComputeEx, - platform::CUDADeviceContext, T, int64_t>( - context, &eigenvalue_tensor, &tol_tensor, axis, - GreaterThanFunctor(), &compare_result); - auto dito_int = - math::DeviceIndependenceTensorOperations(context); - std::vector result_shape = phi::vectorize(dim_out); - Tensor result = dito_int.ReduceSum(compare_result, result_shape); - out->ShareDataWith(result); - } - - void GesvdjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize, - int m, int n, int k, T* A, T* U, T* V, T* S, int* info, - int thin_UV = 1) const; - - void SyevjBatched(const platform::CUDADeviceContext& dev_ctx, int batchSize, - int n, T* A, T* W, int* info) const; -}; - -template <> -void MatrixRankGPUKernel::GesvdjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n, - int k, float* A, float* U, float* V, float* S, int* info, - int thin_UV) const { - // do not compute singular vectors - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - gesvdjInfo_t gesvdj_params = NULL; - int lda = m; - int ldu = m; - int ldt = n; - int lwork = 0; - auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj_bufferSize( - handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, - gesvdj_params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); - int stride_A = lda * n; - int stride_U = ldu * (thin_UV ? k : m); - int stride_V = ldt * (thin_UV ? 
k : n); - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSgesvdj( - handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, - U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, - info, gesvdj_params)); - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); -} - -template <> -void MatrixRankGPUKernel::GesvdjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int m, int n, - int k, double* A, double* U, double* V, double* S, int* info, - int thin_UV) const { - // do not compute singular vectors - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - gesvdjInfo_t gesvdj_params = NULL; - int lda = m; - int ldu = m; - int ldt = n; - int lwork = 0; - auto handle = dev_ctx.cusolver_dn_handle(); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj_bufferSize( - handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork, - gesvdj_params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); - int stride_A = lda * n; - int stride_U = ldu * (thin_UV ? k : m); - int stride_V = ldt * (thin_UV ? 
k : n); - for (int i = 0; i < batchSize; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDgesvdj( - handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i, - U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork, - info, gesvdj_params)); - // check the error info - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params)); -} - -template <> -void MatrixRankGPUKernel::SyevjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, float* A, - float* W, int* info) const { - auto handle = dev_ctx.cusolver_dn_handle(); - // Compute eigenvalues only - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - // matrix is saved as column-major in cusolver. 
- // numpy and torch use lower triangle to compute eigenvalues, so here use - // upper triangle - cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; - int lda = n; - int stride_A = lda * n; - int lwork = 0; - syevjInfo_t params = NULL; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj_bufferSize( - handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); - float* workspace_ptr = reinterpret_cast(workspace->ptr()); - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSsyevj( - handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, - lwork, info, params)); - - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver eigenvalues is not zero. 
[%d]", i, - error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroySyevjInfo(params)); -} - -template <> -void MatrixRankGPUKernel::SyevjBatched( - const platform::CUDADeviceContext& dev_ctx, int batchSize, int n, double* A, - double* W, int* info) const { - auto handle = dev_ctx.cusolver_dn_handle(); - // Compute eigenvalues only - const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR; - // upper triangle of A is stored - cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER; - int lda = n; - int stride_A = lda * n; - int lwork = 0; - syevjInfo_t params = NULL; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnCreateSyevjInfo(¶ms)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj_bufferSize( - handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); - double* workspace_ptr = reinterpret_cast(workspace->ptr()); - - for (int i = 0; i < batchSize; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDsyevj( - handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr, - lwork, info, params)); - int error_info; - memory::Copy(platform::CPUPlace(), &error_info, dev_ctx.GetPlace(), info, - sizeof(int), dev_ctx.stream()); - PADDLE_ENFORCE_EQ( - error_info, 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: CUSolver eigenvalues is not zero. 
[%d]", i, - error_info)); - } - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cusolverDnDestroySyevjInfo(params)); -} - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(matrix_rank, ops::MatrixRankGPUKernel, - ops::MatrixRankGPUKernel); -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index bd9ebd29777def2fafca648ad80bc57bef8df316..e55369e0691ee5e36da76c53c6dd5d13288231f4 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -12,14 +12,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/maxout_op.h" #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace operators { -using framework::Tensor; - class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -130,10 +130,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad); -REGISTER_OP_CPU_KERNEL( - maxout, ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_CPU_KERNEL( - maxout_grad, - ops::MaxOutGradKernel, - ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h deleted file mode 100644 index 922998293943ed5ee1ebcd08b5bcd93467496cb9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/maxout_op.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxOutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - math::MaxOutFunctor maxout_forward; - maxout_forward(context.template device_context(), *in_x, out, - groups, axis); - } -}; - -template -class MaxOutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0.0)); - math::MaxOutGradFunctor maxout_backward; - maxout_backward(device_ctx, *in_x, in_x_grad, *out, 
*out_grad, groups, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 3692ace8bb5a46b06bd10a07a5d5d95d8825bdc6..32ef052119883944abc1876f8bf3a8c028ddc57a 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -21,69 +23,6 @@ class AccuracyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Out"), true, - platform::errors::NotFound("Input (Out) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Indices"), true, - platform::errors::NotFound( - "Input (Indices) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Label"), true, - platform::errors::NotFound( - "Input (Label) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Accuracy"), true, - platform::errors::NotFound( - "Output (Accuracy) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Correct"), true, - platform::errors::NotFound( - "Output (Correct) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Total"), true, - platform::errors::NotFound( - "Output (Total) of AccuracyOp is not found.")); - - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "Accuracy"); - OP_INOUT_CHECK(ctx->HasInput("Indices"), "Input", "Indices", "Accuracy"); - 
OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Accuracy"), "Output", "Accuracy", - "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Correct"), "Output", "Correct", "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Total"), "Output", "Total", "Accuracy"); - - auto inference_dim = ctx->GetInputDim("Out"); - auto label_dim = ctx->GetInputDim("Label"); - // Assume indices has same shape as inference, because - // it's the output of topk. - - PADDLE_ENFORCE_EQ( - label_dim.size(), 2, - platform::errors::InvalidArgument( - "ShapeError: label's dimensions of AccuracyOp must be 2. " - "But received label's dimensions = %d, label's shape = [%s]", - label_dim.size(), label_dim)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(label_dim[1], 1, - platform::errors::InvalidArgument( - "ShapeError: label's second dimension of " - "AccuracyOp must be 1. But received label's " - "second dimension is = %d, label's shape = [%s]", - label_dim[1], label_dim)); - PADDLE_ENFORCE_EQ( - inference_dim[0], label_dim[0], - platform::errors::InvalidArgument( - "ShapeError: the output's num_rows of AccuracyOp must be" - " the same as label's num_rows. But received output's " - "shape = [%s], label's shape = [%s], output's num_rows = %d, " - "label's " - "num_rows = %d", - inference_dim, label_dim, inference_dim[0], label_dim[0])); - } - - ctx->SetOutputDim("Accuracy", {1}); - ctx->SetOutputDim("Correct", {1}); - ctx->SetOutputDim("Total", {1}); - ctx->ShareLoD("Out", /*->*/ "Accuracy"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -123,13 +62,13 @@ with the input Out(Inference). } // namespace operators } // namespace paddle +// FIXME(typhoonzero): types of T is for infernece data. +// label data is always int. 
+DECLARE_INFER_SHAPE_FUNCTOR(accuracy, AccuracyInferShapeFunctor, + PD_INFER_META(phi::AccuracyInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR( accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -// FIXME(typhoonzero): types of T is for infernece data. -// label data is always int. -REGISTER_OP_CPU_KERNEL(accuracy, - ops::AccuracyKernel, - ops::AccuracyKernel); + paddle::framework::EmptyGradOpMaker, + AccuracyInferShapeFunctor); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu deleted file mode 100644 index 6f19100fa9d37e2efedad60a982bf19b09cac736..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include "paddle/fluid/operators/metrics/accuracy_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void AccuracyCudaKernel(const int N, const int D, - const int64_t* Xdata, - const int64_t* labeldata, int* correct_data, - float* accuracy, int* total_data) { - int count = 0; - __shared__ int total[BlockSize]; - - // support only 1 block - for (int i = threadIdx.x; i < (N); i += BlockSize) { - for (int j = 0; j < D; ++j) { - if (Xdata[i * D + j] == labeldata[i]) { - ++count; - break; - } - } - } - total[threadIdx.x] = count; - __syncthreads(); - -// reduce the count with init value 0, and output accuracy. -#ifdef PADDLE_WITH_CUDA - int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); -#else - // HIP thrust::reduce not support __device__ - for (int s = BlockSize / 2; s > 0; s >>= 1) { - if (threadIdx.x < s) { - total[threadIdx.x] += total[threadIdx.x + s]; - } - __syncthreads(); - } - int result = total[0]; -#endif - if (threadIdx.x == 0) { - *correct_data = result; - *accuracy = static_cast(result) / static_cast(N); - *total_data = N; - } -} - -template -class AccuracyOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - // FIXME(typhoonzero): only support indices currently - // if add support for output values, how to detect the data type? 
- const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - int num_samples = static_cast(inference->dims()[0]); - size_t infer_width = inference->dims()[1]; - auto stream = ctx.cuda_device_context().stream(); - platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream); - - if (num_samples == 0) { - return; - } - - AccuracyCudaKernel< - PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - num_samples, infer_width, indices_data, label_data, correct_data, - accuracy_data, total_data); - } -}; - -} // namespace operators -} // namespace paddle - -// FIXME(typhoonzero): types of T is for inference data. -// label data is always int64 -REGISTER_OP_CUDA_KERNEL( - accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.h b/paddle/fluid/operators/metrics/accuracy_op.h deleted file mode 100644 index 94e5bf8257e67b9fd01aa9ae45a25d90963fef13..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class AccuracyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - size_t num_samples = inference->dims()[0]; - size_t class_dim = inference->dims()[1]; - *accuracy_data = 0.0f; - - if (num_samples == 0) { - return; - } - - int num_correct = 0; - // assume inference is already the topk of the output - for (size_t i = 0; i < num_samples; ++i) { - PADDLE_ENFORCE_GE( - label_data[i], 0, - platform::errors::InvalidArgument( - "label of AccuracyOp must >= 0, But received label[%d] is %d", i, - label_data[i])); - for (size_t j = 0; j < class_dim; ++j) { - if (indices_data[i * class_dim + j] == label_data[i]) { - ++num_correct; - break; - } - } - } - - *correct_data = num_correct; - *total_data = num_samples; - *accuracy_data = - static_cast(num_correct) / static_cast(num_samples); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index 2598d3b0277c94a52e1fa14b04c00b595071f312..1ce02ff4525c9692f88ed42b79ff336cc0113c41 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc index 63bccc2e6e065a639c86a647894d2a0c124f0e54..9f2ca4165f33a28902bfe20207b12bad2af49fad 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -12,8 +12,8 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/controlflow/compare_op.h" -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index de71312d78df99adc3b3663f2fcbb3943373982e..3cc1be4de8a82ff263824ab4852178f735596d45 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -14,12 +14,14 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { +using Tensor = paddle::framework::Tensor; template class AccuracyXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 2a3a0fa5d1fe50c93686c76571d812cab18c1d38..f3ed98c3f4d1e47a8b7dff81a998c7574859baa2 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/auc_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,70 +24,6 @@ class AucOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predict"), "Input", "Predict", "Auc"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Auc"); - auto predict_dims = ctx->GetInputDim("Predict"); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_GE( - predict_dims.size(), 2, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. 
The " - "shape of Input(Predict) = [%s], the shape size must be " - "greater_equal 2.", - predict_dims)); - auto predict_width = predict_dims[1]; - PADDLE_ENFORCE_NE( - phi::product(predict_dims), 0, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape can not involes 0.", - predict_dims)); - PADDLE_ENFORCE_NE( - phi::product(label_dims), 0, - platform::errors::InvalidArgument( - "The Input(Label) has not been initialized properly. The " - "shape of Input(Label) = [%s], the shape can not involes 0.", - label_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_LE(predict_width, 2, - platform::errors::InvalidArgument( - "Only support binary classification," - "prediction dims[1] should be 1 or 2")); - } - auto predict_height = ctx->GetInputDim("Predict")[0]; - auto label_height = ctx->GetInputDim("Label")[0]; - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(predict_height, label_height, - platform::errors::InvalidArgument( - "Out and Label should have same height.")); - } - - int num_pred_buckets = ctx->Attrs().Get("num_thresholds") + 1; - int slide_steps = ctx->Attrs().Get("slide_steps"); - - PADDLE_ENFORCE_GE( - num_pred_buckets, 1, - platform::errors::InvalidArgument("num_thresholds must larger than 1")); - PADDLE_ENFORCE_GE(slide_steps, 0, - platform::errors::InvalidArgument( - "slide_steps must be natural number")); - - ctx->SetOutputDim("AUC", {1}); - - if (slide_steps) { - ctx->SetOutputDim("StatPosOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - ctx->SetOutputDim("StatNegOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - } else { - ctx->SetOutputDim("StatPosOut", {1, num_pred_buckets}); - ctx->SetOutputDim("StatNegOut", {1, num_pred_buckets}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -145,5 +84,7 @@ There are two types of possible curves: } // namespace paddle namespace ops = 
paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker); -REGISTER_OP_CPU_KERNEL(auc, ops::AucKernel); +DECLARE_INFER_SHAPE_FUNCTOR(auc, AucInferShapeFunctor, + PD_INFER_META(phi::AucInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker, + AucInferShapeFunctor); diff --git a/paddle/fluid/operators/metrics/auc_op.cu b/paddle/fluid/operators/metrics/auc_op.cu deleted file mode 100644 index 1cb7eba8775e814b1150929de4a341c466ee4583..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/auc_op.cu +++ /dev/null @@ -1,232 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/metrics/auc_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -__global__ void ClearObsoleteDataKernel(int64_t *pos, int64_t *neg, - const int bucket_length, - const int slide_steps) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - CUDA_KERNEL_LOOP(i, bucket_length) { - pos[sum_step_begin + i] -= pos[cur_step_begin + i]; - neg[sum_step_begin + i] -= neg[cur_step_begin + i]; - pos[cur_step_begin + i] = neg[cur_step_begin + i] = 0; - } -} - -__global__ void UpdateSumDataKernel(int64_t *pos, int64_t *neg, - const int bucket_length, - const int slide_steps) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * bucket_length]) % slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - CUDA_KERNEL_LOOP(i, bucket_length) { - pos[sum_step_begin + i] += pos[cur_step_begin + i]; - neg[sum_step_begin + i] += neg[cur_step_begin + i]; - } -} - -template -__global__ void AddDataKernel(const int64_t *label_data, const T *pred_data, - const int inference_width, - const int num_thresholds, int64_t *pos, - int64_t *neg, const int numel, - const int slide_steps) { - int cur_step_begin = 0; - if (slide_steps > 0) { - int cur_step_index = - static_cast(pos[(slide_steps + 1) * (1 + num_thresholds)]) % - slide_steps; - cur_step_begin = cur_step_index * (1 + num_thresholds); - } - CUDA_KERNEL_LOOP(i, numel) { - auto predict_data = pred_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE(predict_data <= 1, "The predict data must 
less or equal 1."); - PADDLE_ENFORCE(predict_data >= 0, - "The predict data must gather or equal 0."); - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i]) { - paddle::platform::CudaAtomicAdd(pos + cur_step_begin + binIdx, 1); - } else { - paddle::platform::CudaAtomicAdd(neg + cur_step_begin + binIdx, 1); - } - } -} -__global__ void CalcAucKernel(int64_t *stat_pos, int64_t *stat_neg, - int num_thresholds, double *auc, - bool need_add_batch_num) { - *auc = 0.0f; - double totPos = 0.0; - double totNeg = 0.0; - double totPosPrev = 0.0; - double totNegPrev = 0.0; - - int idx = num_thresholds; - - while (idx >= 0) { - totPosPrev = totPos; - totNegPrev = totNeg; - totPos += stat_pos[idx]; - totNeg += stat_neg[idx]; - *auc += (totNeg - totNegPrev) * (totPos + totPosPrev) / 2.0; - --idx; - } - - if (totPos > 0.0 && totNeg > 0.0) { - *auc = *auc / totPos / totNeg; - } - if (need_add_batch_num) { - stat_pos[num_thresholds + 1] += 1; - stat_neg[num_thresholds + 1] += 1; - } -} - -template -class AucCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *predict = ctx.Input("Predict"); - auto *label = ctx.Input("Label"); - - int num_thresholds = ctx.Attr("num_thresholds"); - int slide_steps = ctx.Attr("slide_steps"); - - // Only use output var for now, make sure it's persistable and - // not cleaned up for each batch. 
- auto *auc_tensor = ctx.Output("AUC"); - auto *stat_pos = ctx.Output("StatPosOut"); - auto *stat_neg = ctx.Output("StatNegOut"); - - auto *origin_stat_pos = stat_pos->mutable_data(ctx.GetPlace()); - auto *origin_stat_neg = stat_neg->mutable_data(ctx.GetPlace()); - auto *auc_value = auc_tensor->mutable_data(ctx.GetPlace()); - - auto *stat_pos_in_tensor = ctx.Input("StatPos"); - auto *pos_in_data = stat_pos_in_tensor->data(); - auto *stat_neg_in_tensor = ctx.Input("StatNeg"); - auto *neg_in_data = stat_neg_in_tensor->data(); -#ifdef PADDLE_WITH_CUDA - if (stat_pos_in_tensor != stat_pos) { - cudaMemcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - cudaMemcpyDeviceToDevice); - } - if (stat_neg_in_tensor != stat_neg) { - cudaMemcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - cudaMemcpyDeviceToDevice); - } -#else - if (stat_pos_in_tensor != stat_pos) { - hipMemcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - hipMemcpyDeviceToDevice); - } - if (stat_neg_in_tensor != stat_neg) { - hipMemcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t), - hipMemcpyDeviceToDevice); - } -#endif - - statAuc(ctx, label, predict, num_thresholds, slide_steps, origin_stat_pos, - origin_stat_neg); - int sum_offset = slide_steps * (num_thresholds + 1); - auto stream = - ctx.template device_context().stream(); - CalcAucKernel<<<1, 1, 0, stream>>>( - origin_stat_pos + sum_offset, origin_stat_neg + sum_offset, - num_thresholds, auc_value, slide_steps > 0); - } - - private: - inline static double trapezoidArea(double X1, double X2, double Y1, - double Y2) { - return (X1 > X2 ? 
(X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; - } - - inline static void statAuc(const framework::ExecutionContext &ctx, - const framework::Tensor *label, - const framework::Tensor *predict, - const int num_thresholds, const int slide_steps, - int64_t *origin_stat_pos, - int64_t *origin_stat_neg) { - size_t batch_size = predict->dims()[0]; - size_t inference_width = predict->dims()[1]; - const T *inference_data = predict->data(); - const auto *label_data = label->data(); - const int bucket_length = num_thresholds + 1; - auto stream = - ctx.template device_context().stream(); - if (slide_steps == 0) { - AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - label_data, inference_data, inference_width, num_thresholds, - origin_stat_pos, origin_stat_neg, batch_size, slide_steps); - return; - } - // the last number of origin_stat_pos store the index should be used in - // current step - int cur_step_index = - static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % - slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - - ClearObsoleteDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); - - AddDataKernel<<<(batch_size + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - label_data, inference_data, inference_width, num_thresholds, - origin_stat_pos, origin_stat_neg, batch_size, slide_steps); - UpdateSumDataKernel<<<(bucket_length + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - origin_stat_pos, origin_stat_neg, bucket_length, slide_steps); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(auc, - 
ops::AucCUDAKernel); diff --git a/paddle/fluid/operators/metrics/auc_op.h b/paddle/fluid/operators/metrics/auc_op.h deleted file mode 100644 index 10403472c69b57723bc714703c115f07d8640f7e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/auc_op.h +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class AucKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *predict = ctx.Input("Predict"); - auto *label = ctx.Input("Label"); - - int num_thresholds = ctx.Attr("num_thresholds"); - int slide_steps = ctx.Attr("slide_steps"); - - // Only use output var for now, make sure it's persistable and - // not cleaned up for each batch. 
- auto *auc_tensor = ctx.Output("AUC"); - auto *stat_pos = ctx.Output("StatPosOut"); - auto *stat_neg = ctx.Output("StatNegOut"); - - auto *origin_stat_pos = stat_pos->mutable_data(ctx.GetPlace()); - auto *origin_stat_neg = stat_neg->mutable_data(ctx.GetPlace()); - auto *auc_value = auc_tensor->mutable_data(ctx.GetPlace()); - - // Just for pass UT, since UT's input & output connot be set same var - auto *stat_pos_in_tensor = ctx.Input("StatPos"); - auto *pos_in_data = stat_pos_in_tensor->data(); - auto *stat_neg_in_tensor = ctx.Input("StatNeg"); - auto *neg_in_data = stat_neg_in_tensor->data(); - if (stat_pos_in_tensor != stat_pos) { - memcpy(origin_stat_pos, pos_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t)); - } - if (stat_neg_in_tensor != stat_neg) { - memcpy(origin_stat_neg, neg_in_data, - ((1 + slide_steps) * (num_thresholds + 1) + - (slide_steps > 0 ? 1 : 0)) * - sizeof(int64_t)); - } - statAuc(label, predict, num_thresholds, slide_steps, origin_stat_pos, - origin_stat_neg); - - int sum_offset = slide_steps * (num_thresholds + 1); - calcAuc(origin_stat_pos + sum_offset, origin_stat_neg + sum_offset, - num_thresholds, auc_value); - if (slide_steps) { - origin_stat_pos[(slide_steps + 1) * (num_thresholds + 1)] += 1; - origin_stat_neg[(slide_steps + 1) * (num_thresholds + 1)] += 1; - } - } - - private: - inline static double trapezoidArea(double X1, double X2, double Y1, - double Y2) { - return (X1 > X2 ? 
(X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0; - } - - inline static void statAuc(const framework::Tensor *label, - const framework::Tensor *predict, - const int num_thresholds, const int slide_steps, - int64_t *origin_stat_pos, - int64_t *origin_stat_neg) { - size_t batch_size = predict->dims()[0]; - size_t inference_width = predict->dims()[1]; - const T *inference_data = predict->data(); - const auto *label_data = label->data(); - const int bucket_length = num_thresholds + 1; - if (slide_steps == 0) { - for (size_t i = 0; i < batch_size; i++) { - // if predict_data[i] has dim of 2, then predict_data[i][1] is pos prob - // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob - auto predict_data = - inference_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE_LE(predict_data, 1, - platform::errors::PreconditionNotMet( - "The predict data must less or equal 1.")); - PADDLE_ENFORCE_GE(predict_data, 0, - platform::errors::PreconditionNotMet( - "The predict data must gather or equal 0.")); - - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i] > 0) { - origin_stat_pos[binIdx] += 1; - } else if (label_data[i] == 0) { - origin_stat_neg[binIdx] += 1; - } - } - return; - } - // the last number of origin_stat_pos store the index should be used in - // current step - int cur_step_index = - static_cast(origin_stat_pos[(slide_steps + 1) * bucket_length]) % - slide_steps; - int cur_step_begin = cur_step_index * bucket_length; - int sum_step_begin = slide_steps * bucket_length; - for (int i = 0; i < bucket_length; ++i) { - origin_stat_pos[sum_step_begin + i] -= - origin_stat_pos[cur_step_begin + i]; - origin_stat_neg[sum_step_begin + i] -= - origin_stat_neg[cur_step_begin + i]; - } - - std::memset(origin_stat_pos + cur_step_begin, 0, - bucket_length * sizeof(int64_t)); - std::memset(origin_stat_neg + cur_step_begin, 0, - bucket_length * sizeof(int64_t)); - - for (size_t i = 0; i < batch_size; i++) { - // if 
predict_data[i] has dim of 2, then predict_data[i][1] is pos prob - // if predict_data[i] has dim of 1, then predict_data[i][0] is pos prob - auto predict_data = - inference_data[i * inference_width + (inference_width - 1)]; - PADDLE_ENFORCE_LE(predict_data, 1, - platform::errors::PreconditionNotMet( - "The predict data must less or equal 1.")); - PADDLE_ENFORCE_GE(predict_data, 0, - platform::errors::PreconditionNotMet( - "The predict data must gather or equal 0.")); - - uint32_t binIdx = static_cast(predict_data * num_thresholds); - if (label_data[i] > 0) { - origin_stat_pos[cur_step_begin + binIdx] += 1; - } else if (label_data[i] == 0) { - origin_stat_neg[cur_step_begin + binIdx] += 1; - } - } - for (int i = 0; i < bucket_length; ++i) { - origin_stat_pos[sum_step_begin + i] += - origin_stat_pos[cur_step_begin + i]; - origin_stat_neg[sum_step_begin + i] += - origin_stat_neg[cur_step_begin + i]; - } - } - - inline static void calcAuc(const int64_t *stat_pos, const int64_t *stat_neg, - int num_thresholds, double *auc) { - *auc = 0.0f; - - double totPos = 0.0; - double totNeg = 0.0; - double totPosPrev = 0.0; - double totNegPrev = 0.0; - - int idx = num_thresholds; - - while (idx >= 0) { - totPosPrev = totPos; - totNegPrev = totNeg; - totPos += stat_pos[idx]; - totNeg += stat_neg[idx]; - *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); - --idx; - } - - if (totPos > 0.0 && totNeg > 0.0) { - *auc = *auc / totPos / totNeg; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc index 90e6a36220ab04087cd02abd76f6c3598425573c..812c55cdd5055186d7fd83a2057d88256f3b34a3 100644 --- a/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/layer_norm_mkldnn_op.cc @@ -150,4 +150,5 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel { // TODO(jczaja): Enable FP32 when performance is 
good namespace ops = paddle::operators; REGISTER_OP_KERNEL(layer_norm, MKLDNN, ::paddle::platform::CPUPlace, + ops::LayerNormMKLDNNOpKernel, ops::LayerNormMKLDNNOpKernel); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index ab02d4cfed9d54f9d168f6088df3e41d3e3e7c54..1078b451c55bae09c1274fe6ce3f45d21574d5e1 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -12,14 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { using framework::DataLayout; +using framework::Tensor; using dnnl::memory; using dnnl::pooling_backward; using dnnl::pooling_forward; @@ -83,11 +85,11 @@ class PoolingMKLDNNHandler phi::slice_ddim(input_dims, 2, input_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); const auto src_tz = phi::vectorize(input->dims()); const auto dst_tz = phi::vectorize(output->dims()); @@ -173,11 +175,11 @@ class PoolingMKLDNNHandler framework::DDim data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); if (global_pooling) { - operators::UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } - operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, - data_dims, strides, ksize); + 
phi::funcs::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); auto src_tz = phi::vectorize(in_x->dims()); auto diff_src_tz = phi::vectorize(in_x_grad->dims()); diff --git a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc index 780c6e7f153e7b1179e203bc7807dd7818aa591a..a3b764b0e1c46ab91b989ed7f7b0b5df101f7654 100644 --- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc @@ -13,19 +13,32 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { -using paddle::framework::Tensor; +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; template -class ShapeMKLDNNKernel : public ShapeKernel { +class ShapeMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ShapeKernel::Compute(ctx); + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } auto* out = ctx.Output("Out"); out->set_layout(framework::DataLayout::kMKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 2fdeecf89346fcf15f38b291ed5af49b8a2c8fc0..23428dd403e9b1ef62007c7b9193ed3b8482cab3 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ 
b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -29,11 +29,11 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); -USE_OP(conv2d); +USE_OP_ITSELF(conv2d); USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); namespace paddle { @@ -55,7 +55,7 @@ class CacheTester { onednn_dev_ctx_->ResetBlobMap(nullptr); } - bool Analyze(unsigned short int num_entries) { + bool Analyze(uint16_t num_entries) { // Number of created objects in cache should be as expected (num_entries) return onednn_dev_ctx_->GetCachedObjectsNumber() == num_entries; } diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index c776cf2a7c792c429fcf45a367d3f06bf9add5d2..4090d5ffca801512e423b02bfda3dd1a1bc49f03 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -24,14 +24,17 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); +PD_DECLARE_KERNEL(softmax, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 3791fed23a84ff51d022dd24a6a0734a39636a70..717af61b858dc16f9bdda20f530cbf06a09908eb 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -24,14 +24,18 @@ 
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" -USE_OP(pool2d); +USE_OP_ITSELF(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); +PD_DECLARE_KERNEL(pool2d, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc index 884521301750ce92c3f0a2e0b9468c5cc4a57790..6e3bd5e43c9c1d7e5c8a5dd4ba37afcfd7147e20 100644 --- a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc +++ b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc @@ -22,7 +22,7 @@ limitations under the License. */ namespace fw = paddle::framework; namespace plat = paddle::platform; -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MLU); // relu diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 9de03582cbbf53e843e5f4531a6da6c1c2a87dd5..1fdaa153e3c27ed1a83696bf03d68dbfd2b93ae9 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -499,6 +499,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_desc, output)); } +/* static */ void MLUCnnl::Concat(const MLUDeviceContext& dev_ctx, + const int pack_num, const int axis, + const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = dev_ctx.cnnl_handle(); + + size_t workspace_size = 0; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetConcatWorkspaceSize(handle, pack_num, &workspace_size)); + + Tensor workspace(paddle::experimental::DataType::INT8); + workspace.Resize(framework::DDim({static_cast(workspace_size)})); + void* 
workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlConcat(handle, pack_num, axis, inputs_desc, + inputs, workspace_ptr, workspace_size, + output_desc, output)); +} + /* static */ void MLUCnnl::Div( const ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t in0_desc, const void* in0, @@ -977,6 +998,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { output_descs, output_ptrs)); } +/* static */ void MLUCnnl::Split(const MLUDeviceContext& dev_ctx, int split_num, + int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]) { + cnnlHandle_t handle = dev_ctx.cnnl_handle(); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size)); + + Tensor workspace(paddle::experimental::DataType::INT8); + workspace.Resize(framework::DDim({static_cast(workspace_size)})); + void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSplit(handle, split_num, axis, input_desc, + input_ptr, workspace_ptr, workspace_size, + output_descs, output_ptrs)); +} + /* static */ void MLUCnnl::GatherFunctor( const ExecutionContext& ctx, const int axis, const int batch_dims, const cnnlTensorDescriptor_t params_desc, const void* params, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 2a54a8392c7c5bfbf450ee9351a9fda866a07663..b55b10686e92e2b1b5b3a7390289f8329ac04a04 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -403,6 +403,11 @@ class MLUCnnl { const void* const inputs[], const cnnlTensorDescriptor_t output_desc, void* output); + static void Concat(const MLUDeviceContext& dev_ctx, const int pack_num, + const int axis, const cnnlTensorDescriptor_t inputs_desc[], + const void* const inputs[], + const cnnlTensorDescriptor_t 
output_desc, void* output); + static void Cast(const ExecutionContext& ctx, cnnlCastDataType_t cast_type, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t output_desc, void* output); @@ -566,6 +571,12 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_descs[], void* output_ptrs[]); + static void Split(const MLUDeviceContext& dev_ctx, int split_num, int axis, + const cnnlTensorDescriptor_t input_desc, + const void* input_ptr, + const cnnlTensorDescriptor_t output_descs[], + void* output_ptrs[]); + static void Scale(const ExecutionContext& ctx, const int axis, const cnnlTensorDescriptor_t input_desc, const void* input, const cnnlTensorDescriptor_t alpha_desc, const void* alpha, diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu index afb949d3374c62f561e910ea77e516bdb4004ac0..2bacda8afb0eb340c4c8d4068f3013e2adbc7f91 100644 --- a/paddle/fluid/operators/mode_op.cu +++ b/paddle/fluid/operators/mode_op.cu @@ -24,7 +24,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/multi_dot_op.cc b/paddle/fluid/operators/multi_dot_op.cc index fe4609b3ad91e703fc28a997d5505d4cffa001a8..b309e1b87ef9033bd4302cdad4ea60a64cbf02eb 100644 --- a/paddle/fluid/operators/multi_dot_op.cc +++ b/paddle/fluid/operators/multi_dot_op.cc @@ -87,135 +87,6 @@ inline framework::DDim ComputeAndCheckShape( return out_dim; } -template -inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, - const framework::Tensor& matrix_a, - const framework::Tensor& matrix_b, - const framework::DDim& a_dim, - const framework::DDim& b_dim) { - auto place = ctx.GetPlace(); - auto blas = phi::funcs::GetBlas(ctx); - - framework::Tensor matrix_c; - framework::DDim c_dim = phi::make_ddim({a_dim[0], b_dim[1]}); - 
matrix_c.Resize(c_dim); - matrix_c.mutable_data(place); - - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, false); - const T alpha = static_cast(1.0); - blas.MatMul(matrix_a, mat_dim_a, matrix_b, mat_dim_b, alpha, &matrix_c, T(0)); - return matrix_c; -} - -/** - * @brief Recursively calculate matrix multiplication according to the optimal - * order - * Let k = order[i,j], then ins[i...j] = ins[i...k] * ins[k+1 ...j] - * - * @param - * ins: the input tensors - * ins_dims: the shape of ins after reshape - * order: the optimal order - * i: the left of sub chain - * j: the righe of sub chain - * save_result: set true by backward - * results: save the intermediate result during backward - */ -template -inline framework::Tensor MatChainMul( - const framework::ExecutionContext& ctx, - const std::vector& ins, - const std::vector& ins_dims, - const std::vector& order, const uint64_t i, const uint64_t j, - const bool save_result, std::vector* results) { - if (i == j) { - return *ins[i]; - } - - const auto A = MatChainMul(ctx, ins, ins_dims, order, i, - order[i * ins.size() + j], - save_result, results); - framework::DDim a_dim = A.dims(); - if (i == order[i * ins.size() + j]) { - a_dim = ins_dims[i]; - } - - const auto B = MatChainMul(ctx, ins, ins_dims, order, - order[i * ins.size() + j] + 1, j, - save_result, results); - framework::DDim b_dim = B.dims(); - if (j == order[i * ins.size() + j] + 1) { - b_dim = ins_dims[j]; - } - - auto result = MatMul(ctx, A, B, a_dim, b_dim); - if (save_result) { - (*results)[i * ins.size() + j] = result; - } - return result; -} - -/** - * @brief get the optimal order - */ -std::vector GetOrder(const std::vector& ins, - const std::vector& ins_dims) { - auto n = ins.size(); - // p: save the ins shape, the ins[i] shape is (p[i], p[i+1]) - std::vector p(n + 1); - for (uint64_t i = 0; i < n; i++) { - p[i] = ins_dims[i][0]; - } - p[n] = ins_dims[n - 1][1]; - - 
// m[i, j]: save the lowest cost for multiplying ins[i...j] - std::vector m(n * n, 0); - // define ins[i...j] means multiplying matrices from ins[i] to ins[j] - // order[i, j] = k, this means that ins[i...k] and ins[k...j] fist and then - // multiply the resulting matrices is the optimal order for ins[i...j] - std::vector order(n * n); - for (uint64_t l = 1; l < n; l++) { - for (uint64_t i = 0; i < n - l; i++) { - auto j = i + l; - m[i * n + j] = 0xffffffff; - for (uint64_t k = i; k < j; k++) { - uint64_t q = - m[i * n + k] + m[(k + 1) * n + j] + p[i] * p[k + 1] * p[j + 1]; - if (q < m[i * n + j]) { - m[i * n + j] = q; - order[i * n + j] = k; - } - } - } - } - return order; -} - -template -static inline framework::Tensor MultiDotMatChainOrder( - const framework::ExecutionContext& ctx, - const std::vector& ins, - const std::vector& ins_dims, const bool save_result, - std::vector* results) { - auto order = GetOrder(ins, ins_dims); - return MatChainMul(ctx, ins, ins_dims, order, 0, - ins.size() - 1, save_result, results); -} - -inline void GetDims(const std::vector& ins, - std::vector* ins_dims) { - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - (*ins_dims)[i] = ins[i]->dims(); - if (i == 0 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = phi::make_ddim({1, (*ins_dims)[i][0]}); - } else if (i == n - 1 && (*ins_dims)[i].size() == 1) { - (*ins_dims)[i] = phi::make_ddim({(*ins_dims)[i][0], 1}); - } - } -} - class MultiDotOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -252,78 +123,6 @@ class MultiDotOp : public framework::OperatorWithKernel { } }; -/** - * 1. there are only 2 matrices: direct matrix multiplication A*B - * 2. there are only 3 matrices: calculate the cost of (A*B)*C and A*(B*C), - * choose the least cost order for calculation - * 3. 
more than 3 matrices: call MultiDotMatChainOrder - */ -template -class MultiDotKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto ins = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); - - auto place = ctx.GetPlace(); - out->mutable_data(place); - - auto blas = phi::funcs::GetBlas(ctx); - - auto n = ins.size(); - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - const T scale = static_cast(1.0); - if (n == 2) { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0)); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - framework::Tensor tmp_out; - tmp_out.mutable_data(place, Ma * Nb * sizeof(T)); - framework::DDim tmp_dim = phi::make_ddim({Ma, Nb}); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, - T(0)); - auto mat_dim_tmp = - phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0)); - } else { - framework::Tensor tmp_out; - tmp_out.mutable_data(place, Ka * Nc * sizeof(T)); - framework::DDim tmp_dim = phi::make_ddim({Ka, Nc}); - blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, - T(0)); - auto mat_dim_tmp = - phi::funcs::CreateMatrixDescriptor(tmp_dim, 0, false); - blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0)); - } 
- } else { - std::vector results; - const auto tmp = MultiDotMatChainOrder( - ctx, ins, ins_dims, false, &results); - auto out_dim = out->dims(); - *out = tmp; - out->Resize(out_dim); - } - } -}; - class MultiDotOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -341,180 +140,6 @@ class MultiDotOpGrad : public framework::OperatorWithKernel { } }; -template -class MultiDotGradKernel : public framework::OpKernel { - public: - /** - * @brief calculate dA and dB - * dA = dout * transpose(B) - * dB = transpose(A) * dout - */ - void CalcGrad(const framework::ExecutionContext& ctx, - const framework::Tensor& dout, const framework::Tensor& A, - const framework::Tensor& B, const framework::DDim& dout_dim, - const framework::DDim& a_dim, const framework::DDim& b_dim, - framework::Tensor* dA, framework::Tensor* dB) const { - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a_dim, 0, true); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b_dim, 0, true); - T alpha = static_cast(1.0); - auto blas = phi::funcs::GetBlas(ctx); - blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0)); - blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0)); - } - - /** - * @brief calculate multi matrix multiplication grad by a chain order - * @param - * dout: the grad of multi matrix multiplication out - * dx: the out grad of inputs - * ins: the input tensors - * ins_dims: the shape of ins after reshape - * order: the optimal order - * i: the left of sub chain - * j: the righe of sub chain - * results: the intermediate result of farward - */ - void MatChainMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor& dout, - std::vector* dx, - const std::vector& ins, - const framework::DDim& dout_dim, - const std::vector& ins_dims, - const std::vector& order, const uint64_t i, - const uint64_t j, - const std::vector& 
results) const { - if (i == j) { - *((*dx)[i]) = dout; - return; - } - - const auto n = ins.size(); - const auto right = order[i * n + j]; - const auto left = order[i * n + j] + 1; - // get the multi result of left sub chain - const auto* A = &results[i * n + right]; - framework::DDim a_dim = A->dims(); - if (i == right) { - A = ins[i]; - a_dim = ins_dims[i]; - } - // get the multi result of right sub chain - const auto* B = &results[left * n + j]; - framework::DDim b_dim = B->dims(); - if (left == j) { - B = ins[j]; - b_dim = ins_dims[j]; - } - framework::Tensor dA, dB; - dA.Resize({dout_dim[0], b_dim[0]}); - dB.Resize({a_dim[1], dout_dim[1]}); - dA.mutable_data(ctx.GetPlace()); - dB.mutable_data(ctx.GetPlace()); - - CalcGrad(ctx, dout, *A, *B, dout_dim, a_dim, b_dim, &dA, &dB); - MatChainMulGrad(ctx, dA, dx, ins, dA.dims(), ins_dims, order, i, right, - results); - MatChainMulGrad(ctx, dB, dx, ins, dB.dims(), ins_dims, order, left, j, - results); - } - - void MultiDotGradMatChainOrder( - const framework::ExecutionContext& ctx, const framework::Tensor& dout, - const std::vector& ins, - const framework::DDim& dout_dim, - const std::vector& ins_dims, - std::vector* dx) const { - auto order = GetOrder(ins, ins_dims); - auto n = ins.size(); - std::vector results(n * n); - MatChainMul(ctx, ins, ins_dims, order, 0, n - 1, true, - &results); - MatChainMulGrad(ctx, dout, dx, ins, dout_dim, ins_dims, order, 0, n - 1, - results); - } - - void Compute(const framework::ExecutionContext& ctx) const { - auto ins = ctx.MultiInput("X"); - auto dout = *ctx.Input(framework::GradVarName("Out")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - - auto blas = phi::funcs::GetBlas(ctx); - auto place = ctx.GetPlace(); - - const auto n = ins.size(); - for (size_t i = 0; i < n; i++) { - dx[i]->mutable_data(place); - } - - std::vector ins_dims(n); - GetDims(ins, &ins_dims); - - framework::DDim dout_dim = dout.dims(); - if (ins[0]->dims().size() == 1 && ins[n - 1]->dims().size() == 
1) { - dout_dim = phi::make_ddim({1, 1}); - } else if (ins[0]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = phi::make_ddim({1, dout_dim[0]}); - } - } else if (ins[n - 1]->dims().size() == 1) { - if (dout_dim.size() == 1) { - dout_dim = phi::make_ddim({dout_dim[0], 1}); - } - } - - T alpha = static_cast(1); - auto mat_dim_dout = phi::funcs::CreateMatrixDescriptor(dout_dim, 0, false); - if (n == 2) { - CalcGrad(ctx, dout, *ins[0], *ins[1], dout_dim, ins_dims[0], ins_dims[1], - dx[0], dx[1]); - } else if (n == 3) { - const auto Ma = ins_dims[0][0]; - const auto Ka = ins_dims[0][1]; - const auto Nb = ins_dims[1][1]; - const auto Nc = ins_dims[2][1]; - const uint64_t cost1 = Ma * Nb * (Ka + Nc); - const uint64_t cost2 = Ka * Nc * (Nb + Ma); - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false); - if (cost1 < cost2) { - framework::Tensor tmp_out, tmp_dout; - tmp_out.Resize({Ma, Nb}); - tmp_out.mutable_data(place); - tmp_dout.Resize({mat_dim_dout.height_, Nb}); - tmp_dout.mutable_data(place); - blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, alpha, &tmp_out, - T(0)); - CalcGrad(ctx, dout, tmp_out, *ins[2], dout_dim, tmp_out.dims(), - ins_dims[2], &tmp_dout, dx[2]); - CalcGrad(ctx, tmp_dout, *ins[0], *ins[1], tmp_dout.dims(), ins_dims[0], - ins_dims[1], dx[0], dx[1]); - } else { - framework::Tensor tmp_out, tmp_dout; - tmp_out.Resize({Ka, Nc}); - tmp_out.mutable_data(place); - tmp_dout.Resize({Ka, mat_dim_dout.width_}); - tmp_dout.mutable_data(place); - blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, alpha, &tmp_out, - T(0)); - CalcGrad(ctx, dout, *ins[0], tmp_out, dout_dim, ins_dims[0], - tmp_dout.dims(), dx[0], &tmp_dout); - CalcGrad(ctx, tmp_dout, *ins[1], *ins[2], tmp_dout.dims(), ins_dims[1], - ins_dims[2], dx[1], dx[2]); - } - } else { - 
MultiDotGradMatChainOrder(ctx, dout, ins, dout_dim, ins_dims, &dx); - if (ins[n - 1]->dims().size() == 1) { - dx[n - 1]->Resize({dx[n - 1]->dims()[0]}); - } - } - } -}; - template class MultiDotOpGradMaker : public framework::SingleGradOpMaker { public: @@ -552,25 +177,3 @@ REGISTER_OPERATOR(multi_dot, ops::MultiDotOp, ops::MultiDotOpMaker, REGISTER_OPERATOR(multi_dot_grad, ops::MultiDotOpGrad, ops::MultiDotOpDoubleGradMaker, ops::MultiDotOpDoubleGradMaker); - -REGISTER_OP_CPU_KERNEL( - multi_dot, ops::MultiDotKernel, - ops::MultiDotKernel); -REGISTER_OP_CPU_KERNEL( - multi_dot_grad, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - multi_dot, ops::MultiDotKernel, - ops::MultiDotKernel, - ops::MultiDotKernel); -REGISTER_OP_CUDA_KERNEL( - multi_dot_grad, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel, - ops::MultiDotGradKernel); -#endif diff --git a/paddle/fluid/operators/multinomial_op.cc b/paddle/fluid/operators/multinomial_op.cc index 1143f9cb37aa54bea430d3a8bca8b62b02da4e2b..0113f638b9a47d161c890a0f547f8680af4018e7 100644 --- a/paddle/fluid/operators/multinomial_op.cc +++ b/paddle/fluid/operators/multinomial_op.cc @@ -53,8 +53,8 @@ class MultinomialOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, - PT_INFER_META(phi::MultinomialInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, + PD_INFER_META(phi::MultinomialInferMeta)); REGISTER_OPERATOR( multinomial, ops::MultinomialOp, ops::MultinomialOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc index ab9f10070fc60deab8974ae0e81e2b4c6cef2ffd..bf7222fc45c66085473eae627abe97b8a41d4268 100644 --- a/paddle/fluid/operators/mv_op.cc +++ b/paddle/fluid/operators/mv_op.cc @@ -16,8 +16,11 
@@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -42,33 +45,6 @@ class MVOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "mv"); - OP_INOUT_CHECK(context->HasInput("Vec"), "Input", "Vec", "mv"); - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "mv"); - - auto dim_x = context->GetInputDim("X"); - auto dim_vec = context->GetInputDim("Vec"); - PADDLE_ENFORCE_EQ( - dim_x.size(), 2, - platform::errors::InvalidArgument( - "The rank of input X should be 2, but is %d", dim_x.size())); - PADDLE_ENFORCE_EQ( - dim_vec.size(), 1, - platform::errors::InvalidArgument( - "The rank of input Vec should be 1, but is %d", dim_vec.size())); - PADDLE_ENFORCE_EQ(dim_x[1], dim_vec[0], - platform::errors::InvalidArgument( - "X's second dimension is expected to be equal to " - "Vec's first dimension" - "but recieved X'shape = [%s], Vec's shape = [%s]", - dim_x, dim_vec)); - - framework::DDim dim_out = phi::make_ddim({dim_x[0]}); - - context->SetOutputDim("Out", dim_out); - context->ShareLoD("X", /*->*/ "Out"); - } }; template @@ -118,7 +94,11 @@ class MVOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(mv, MvInferShapeFunctor, + PD_INFER_META(phi::MvInferMeta)); + REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker, ops::MVOpGradMaker, - ops::MVOpGradMaker); + ops::MVOpGradMaker, + MvInferShapeFunctor); REGISTER_OPERATOR(mv_grad, ops::MVOpGrad); diff --git a/paddle/fluid/operators/nll_loss_op.cc 
b/paddle/fluid/operators/nll_loss_op.cc index f510c7bebec876d034c1af923a4f7077c096000c..a4e1f7b3091a9f692e479300310333bfdd359096 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/nll_loss_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,77 +25,6 @@ class NLLLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Total_weight"), "Output", "Total_weight", - "NLLLoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto label_dims = ctx->GetInputDim("Label"); - auto reduction = ctx->Attrs().Get("reduction"); - - PADDLE_ENFORCE_EQ(x_dims.size() == 2 || x_dims.size() == 4, true, - platform::errors::InvalidArgument( - "The tensor rank of Input(X) must be 2 or 4.")); - bool contain_unknown_dim = phi::contain_unknown_dim(x_dims) || - phi::contain_unknown_dim(label_dims); - bool check = ctx->IsRuntime() || !contain_unknown_dim; - if (check) { - PADDLE_ENFORCE_EQ( - x_dims[0], label_dims[0], - platform::errors::InvalidArgument( - "ShapeError: Expected input batch_size to match label batch_size," - "But received: the Input(x) batch_size is [%s], the Input(label) " - " batch_size is [%s].", - x_dims[0], label_dims[0])); - if (ctx->HasInput("Weight")) { - auto w_dims = 
ctx->GetInputDim("Weight"); - PADDLE_ENFORCE_EQ(w_dims.size(), 1, - platform::errors::InvalidArgument( - "Input(Weight) should be a 1D tensor.")); - PADDLE_ENFORCE_EQ( - x_dims[1], w_dims[0], - platform::errors::InvalidArgument( - "Expected input tensor Weight's size should equal " - "to the first dimension of the input tensor X. But received " - "Weight's " - "size is %d, the first dimension of input X is %d", - w_dims[0], x_dims[1])); - } - } - if (x_dims.size() == 2) { - if (reduction == "none") { - ctx->SetOutputDim("Out", {x_dims[0]}); - } else { - ctx->SetOutputDim("Out", {1}); - } - } else if (x_dims.size() == 4) { - PADDLE_ENFORCE_EQ(label_dims.size(), 3, - platform::errors::InvalidArgument( - "Expected Input(Lable) dimensions=3, received %d.", - label_dims.size())); - auto input0 = x_dims[0]; - auto input2 = x_dims[2]; - auto input3 = x_dims[3]; - auto label0 = label_dims[0]; - auto label1 = label_dims[1]; - auto label2 = label_dims[2]; - PADDLE_ENFORCE_EQ( - input0 == label0 && input2 == label1 && input3 == label2, true, - platform::errors::InvalidArgument("Input(X) tensor shape should " - "match to Input(Label) tensor " - "shape.")); - if (reduction == "none") { - ctx->SetOutputDim("Out", {x_dims[0], x_dims[2], x_dims[3]}); - } else { - ctx->SetOutputDim("Out", {1}); - } - } - ctx->SetOutputDim("Total_weight", {1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -259,15 +190,11 @@ class NLLLossGradMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(nll_loss, NllLossRawInferShapeFunctor, + PD_INFER_META(phi::NllLossRawInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(nll_loss, ops::NLLLossOp, ops::NLLLossOpMaker, ops::NLLLossGradMaker, - ops::NLLLossGradMaker); + ops::NLLLossGradMaker, + NllLossRawInferShapeFunctor); REGISTER_OPERATOR(nll_loss_grad, ops::NLLLossGradOp); -REGISTER_OP_CPU_KERNEL( 
- nll_loss, ops::NLLLossOpKernel, - ops::NLLLossOpKernel); -REGISTER_OP_CPU_KERNEL( - nll_loss_grad, - ops::NLLLossGradOpKernel, - ops::NLLLossGradOpKernel); diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h deleted file mode 100644 index be6f4422d4ac6a475477c025c4b76eabdbf4f9e0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/nll_loss_op.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void nll_loss_1D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int64_t i = 0; i < batch_size; ++i) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "Label value is out of range. 
" - "Expected label value in range of [0, %d), but " - "received value is %d.", - n_classes, cur_label)); - - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - out_data[i] = -x_data[i * n_classes + cur_label] * cur_weight; - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int64_t i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= x_data[i * n_classes + cur_label] * cur_weight; - } - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? 
weight_data[cur_label] : static_cast(1); - out_data[index] = -x_data[i * sample_size + cur_label * map_size + - h * in_dim3 + w] * - cur_weight; - } - } - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= - x_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] * - cur_weight; - } - } - } - - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -class NLLLossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* out = ctx.Output("Out"); - auto* total_weight = ctx.Output("Total_weight"); - auto reduction = ctx.Attr("reduction"); - auto ignore_index = ctx.Attr("ignore_index"); - - auto x_data = x->data(); - auto label_data = labels->data(); - auto weight_data = weight ? 
weight->data() : nullptr; - auto out_data = out->mutable_data(ctx.GetPlace()); - auto total_weight_data = total_weight->mutable_data(ctx.GetPlace()); - *total_weight_data = 0; - - auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_1D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_2D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, in_dim2, in_dim3, - reduction, ignore_index); - } - } -}; - -template -static void nll_loss_grad_1D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_data[i] * cur_weight; - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? 
weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[i * n_classes + cur_label] /= total_weight_val; - } - } -} - -template -static void nll_loss_grad_2D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] = - -cur_weight * dout_data[index]; - } - } - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? 
weight_data[cur_label] : static_cast(1); - const auto dx_index = - i * sample_size + cur_label * map_size + h * in_dim3 + w; - dx_data[dx_index] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[dx_index] /= total_weight_val; - } - } - } - } -} - -template -class NLLLossGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* total_weight = ctx.Input("Total_weight"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto ignore_index = ctx.Attr("ignore_index"); - auto reduction = ctx.Attr("reduction"); - - auto dx_data = dx->mutable_data(ctx.GetPlace()); - auto dout_data = dout->data(); - auto label_data = labels->data(); - auto weight_data = weight ? weight->data() : nullptr; - auto total_weight_data = total_weight->data(); - memset(dx_data, 0, dx->numel() * sizeof(T)); - - const auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_grad_1D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_grad_2D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, in_dim2, - in_dim3, reduction, ignore_index); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index c400a8f4239a605414bf0d99a6a89b0ddae6c535..0ed1f2719de25bd2c138c23dd69b914a66961464 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal( } template -void 
NormDoubleGradFunctor(const framework::ExecutionContext &ctx, +void NormDoubleGradFunctor(const DeviceContext &ctx, const DataLayout data_layout, const Tensor *X, const Tensor *Scale, const Tensor *dY, const Tensor *Saved_mean, - const Tensor *Saved_variance, const double epsilon, + const Tensor *Saved_variance, const Tensor *Mean, + const Tensor *Variance, const double epsilon, const bool use_global_stats, const Tensor *ddX, const Tensor *ddScale, const Tensor *ddBias, Tensor *dX, Tensor *dScale, Tensor *ddY) { @@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data()); - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; + phi::funcs::SetConstant set_constant; auto &x_dims = X->dims(); const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] @@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, Tensor scale_tmp; if (!Scale) { scale_tmp.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_tmp, static_cast(1)); + set_constant(ctx, &scale_tmp, static_cast(1)); } const T *scale_data = Scale ? 
Scale->data() : scale_tmp.data(); #ifdef __HIPCC__ @@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, #else const int block = 512; #endif - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(C, max_blocks); int grid1 = (num + block - 1) / block; const T *mean_data, *variance_data; if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); + const auto *running_mean = Mean; + const auto *running_var = Variance; const auto *running_mean_data = running_mean->template data(); const auto *running_var_data = running_var->template data(); mean_data = running_mean_data; @@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } else { const T *smean_data = Saved_mean->data(); const T *svariance_data = Saved_variance->data(); + mean_data = smean_data; variance_data = svariance_data; } if (dX) { T *dx_data = dX->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dX, static_cast(0)); + set_constant(ctx, dX, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDXWithGlobal< - T, DataLayout::kNHWC><<>>( + T, DataLayout::kNHWC><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } else { DoubleGradComputeDXWithGlobal< - T, DataLayout::kNCHW><<>>( + T, DataLayout::kNCHW><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDX< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, ddscale_data, N, C, sample_size, epsilon, dx_data); } else { DoubleGradComputeDX< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, 
variance_data, ddx_data, dy_data, scale_data, ddscale_data, N, C, sample_size, epsilon, dx_data); } @@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } if (dScale) { T *dscale_data = dScale->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dScale, static_cast(0)); + set_constant(ctx, dScale, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, dscale_data); } else { DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, dscale_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDScale< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, N, C, sample_size, epsilon, dscale_data); } else { DoubleGradComputeDScale< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, N, C, sample_size, epsilon, dscale_data); } @@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } if (ddY) { T *ddy_data = ddY->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, ddY, static_cast(0)); + set_constant(ctx, ddY, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNHWC><<>>( + T, DataLayout::kNHWC><<>>( ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddscale_data, epsilon, C, sample_size, num, ddy_data); } else { DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNCHW><<>>( + T, DataLayout::kNCHW><<>>( ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddscale_data, epsilon, C, sample_size, num, ddy_data); } } else { if 
(data_layout == DataLayout::kNHWC) { DoubleGradComputeDDY< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } else { DoubleGradComputeDDY< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc index e212f4e7e2b7d1ad7964cc9351f1c4e241d5a79e..122b6a8a80aac95ab98ad95ed3e6339684978d12 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cc +++ b/paddle/fluid/operators/one_hot_v2_op.cc @@ -12,9 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/one_hot_v2_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,26 +26,6 @@ namespace operators { class OneHotV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "one_hot_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "one_hot_v2"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 1, - platform::errors::InvalidArgument( - "Rank of Input(X) should be at least 1.")); - - int depth = ctx->Attrs().Get("depth"); - if (ctx->HasInput("depth_tensor")) { - depth = -1; - } - - auto out_dims_vec = phi::vectorize(x_dims); - out_dims_vec.push_back(depth); - auto out_dims = phi::make_ddim(out_dims_vec); - 
ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /* --> */ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -52,7 +36,7 @@ class OneHotV2Op : public framework::OperatorWithKernel { } framework::OpKernelType GetKernelTypeForVar( - const std::string& var_name, const Tensor& tensor, + const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { if (var_name == "depth_tensor") { return expected_kernel_type; @@ -114,10 +98,12 @@ Out is a LoDTensor: } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(one_hot_v2, OneHotInferShapeFunctor, + PD_INFER_META(phi::OneHotRawInferMeta)); + REGISTER_OPERATOR( one_hot_v2, ops::OneHotV2Op, ops::OneHotV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - one_hot_v2, ops::OneHotV2Kernel, - ops::OneHotV2Kernel); + paddle::framework::EmptyGradOpMaker, + OneHotInferShapeFunctor); diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu deleted file mode 100644 index 77e2a931e50de5b7775463fc7bbf6262e2ad4a53..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/one_hot_v2_op.cu +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/one_hot_v2_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data, - const int64_t numel, const int depth) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) { - *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; - } -} - -template -struct OneHotV2OpCUDAFunctor { - const framework::LoDTensor* in_; - framework::LoDTensor* out_; - const DeviceContext& ctx_; - int depth_; - - OneHotV2OpCUDAFunctor(const framework::LoDTensor* in, - framework::LoDTensor* out, int depth, - const DeviceContext& ctx) - : in_(in), out_(out), depth_(depth), ctx_(ctx) {} - - template - void apply() const { - auto* p_in_data = in_->data(); - auto numel = in_->numel(); - auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - auto stream = ctx_.stream(); - phi::funcs::set_constant(ctx_, out_, 0.0); - - FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - p_in_data, p_out_data, numel, depth_); - } -}; - -using LoDTensor = framework::LoDTensor; -template -class OneHotV2CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - - int depth = -1; - if (context.HasInput("depth_tensor")) { - auto* depth_tensor = context.Input("depth_tensor"); - if (platform::is_gpu_place(depth_tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*depth_tensor, platform::CPUPlace(), - &temp); - depth = *temp.data(); - } else { - depth = *depth_tensor->data(); - } - - auto out_dims = out->dims(); - out_dims[out_dims.size() - 1] = 
depth; - out->Resize(out_dims); - } else { - depth = context.Attr("depth"); - } - framework::VisitDataType( - static_cast( - context.Attr("dtype")), - OneHotV2OpCUDAFunctor( - in, out, depth, context.template device_context())); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - one_hot_v2, - ops::OneHotV2CUDAKernel, - ops::OneHotV2CUDAKernel); diff --git a/paddle/fluid/operators/one_hot_v2_op_npu.cc b/paddle/fluid/operators/one_hot_v2_op_npu.cc index acf6baf50b418ae0fd68d64f52f80f47df1c60c3..e5702a37bb2b4a4180e209bb5e306be64830bd99 100644 --- a/paddle/fluid/operators/one_hot_v2_op_npu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_npu.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/one_hot_v2_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; template class OneHotV2NPUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc index b96fcaa486cce8099cf1d03c7d948ea74c1923ad..372a71706ab5ec72b6da4cbac1b63333f42cb265 100644 --- a/paddle/fluid/operators/op_debug_string_test.cc +++ b/paddle/fluid/operators/op_debug_string_test.cc @@ -17,8 +17,10 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add_grad); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index 
ad7f93d73e902bbac684832d3a77ba83b517daf6..315831ddc0f290cc8c7ad1b78ce8625722f91d3b 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/optimizers/adadelta_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -23,77 +26,6 @@ class AdadeltaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true, - platform::errors::InvalidArgument( - "Input(Param) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true, - platform::errors::InvalidArgument( - "Input(Grad) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("AvgSquaredGrad"), true, - platform::errors::InvalidArgument( - "Input(AvgSquaredGrad) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("AvgSquaredUpdate"), true, - platform::errors::InvalidArgument( - "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Param").front() == - framework::proto::VarType::LOD_TENSOR, - true, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Grad").front() == - framework::proto::VarType::LOD_TENSOR, - true, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, 
but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - PADDLE_ENFORCE_EQ( - ctx->HasOutput("ParamOut"), true, - platform::errors::InvalidArgument( - "Output(ParamOut) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AvgSquaredGradOut"), true, - platform::errors::InvalidArgument( - "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("AvgSquaredUpdateOut"), true, - platform::errors::InvalidArgument( - "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.")); - - auto param_dim = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and grad input of AdadeltaOp should have same dimension.")); - PADDLE_ENFORCE_NE( - phi::product(ctx->GetInputDim("AvgSquaredGrad")), 0, - platform::errors::InvalidArgument( - "Maybe the Input variable AvgSquaredGrad has not " - "been initialized. You may need to confirm if you put " - "exe.run(startup_program) after optimizer.minimize " - "function.")); - PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"), - platform::errors::InvalidArgument( - "Param and AvgSquaredGrad input of AdadeltaOp " - "should have same dimension")); - PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"), - platform::errors::InvalidArgument( - "Param and AvgSquaredUpdate input of AdadeltaOp " - "should have same dimension")); - - ctx->SetOutputDim("ParamOut", param_dim); - ctx->SetOutputDim("AvgSquaredGradOut", param_dim); - ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( @@ -149,7 +81,11 @@ $$ } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); -REGISTER_OP_CPU_KERNEL( - adadelta, 
ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(adadelta, AdadeltaInferMetaFunctor, + PD_INFER_META(phi::AdadeltaInferMeta)); +REGISTER_OPERATOR( + adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdadeltaInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/adadelta_op.h b/paddle/fluid/operators/optimizers/adadelta_op.h deleted file mode 100644 index 85cfad35858bbe6b112169f196c0711d981e9446..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/optimizers/adadelta_op.h +++ /dev/null @@ -1,84 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AdadeltaOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - auto param_out_tensor = ctx.Output("ParamOut"); - auto avg_squared_grad_out_tensor = - ctx.Output("AvgSquaredGradOut"); - auto avg_squared_update_out_tensor = - ctx.Output("AvgSquaredUpdateOut"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - avg_squared_grad_out_tensor->mutable_data(ctx.GetPlace()); - avg_squared_update_out_tensor->mutable_data(ctx.GetPlace()); - - T rho = static_cast(ctx.Attr("rho")); - T epsilon = static_cast(ctx.Attr("epsilon")); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - // Squared gradient accumulator - auto avg_squared_grad = framework::EigenVector::Flatten( - *ctx.Input("AvgSquaredGrad")); - // Squared updates accumulator - auto avg_squared_update = framework::EigenVector::Flatten( - *ctx.Input("AvgSquaredUpdate")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto avg_squared_grad_out = - framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); - auto avg_squared_update_out = - 
framework::EigenVector::Flatten(*avg_squared_update_out_tensor); - auto& place = *ctx.template device_context().eigen_device(); - - avg_squared_grad_out.device(place) = - rho * avg_squared_grad + (1 - rho) * grad.square(); - auto update = - -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon)) - .sqrt() * - grad; - avg_squared_update_out.device(place) = - rho * avg_squared_update + (1 - rho) * update.square(); - param_out.device(place) = param + update; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index a95a37c980c8c9d41dc9fd352e3dace787a7c4e9..036839dd1300feac544a6f1ca661598f4360f745 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/optimizers/adamax_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -22,67 +25,6 @@ class AdamaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Param"), "Input", "Param", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Moment"), "Input", "Moment", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("InfNorm"), "Input", "InfNorm", "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate", - "Adamax"); - OP_INOUT_CHECK(ctx->HasInput("Beta1Pow"), "Input", "Beta1Pow", "Adamax"); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Param").front(), - framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), - ctx->GetInputsVarType("Param").front())); - PADDLE_ENFORCE_EQ( - ctx->GetInputsVarType("Grad").front(), - framework::proto::VarType::LOD_TENSOR, - platform::errors::InvalidArgument( - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Grad").front(), - ctx->GetInputsVarType("Grad").front())); - - OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut", "Adamax"); - OP_INOUT_CHECK(ctx->HasOutput("MomentOut"), "Output", "MomentOut", - "Adamax"); - OP_INOUT_CHECK(ctx->HasOutput("InfNormOut"), "Output", "InfNormOut", - "Adamax"); - - auto lr_dims = ctx->GetInputDim("LearningRate"); - PADDLE_ENFORCE_NE(phi::product(lr_dims), 0, - platform::errors::InvalidArgument( - "Maybe the Input variable LearningRate has not " - "been initialized. 
You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.")); - PADDLE_ENFORCE_EQ(phi::product(lr_dims), 1, - platform::errors::InvalidArgument( - "Learning rate should have 1 dimension")); - auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); - PADDLE_ENFORCE_EQ(phi::product(beta1_pow_dims), 1, - platform::errors::InvalidArgument( - "Beta1 power accumulator should have 1 dimension")); - auto param_dims = ctx->GetInputDim("Param"); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Grad"), - platform::errors::InvalidArgument( - "Param and Grad input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("Moment"), - platform::errors::InvalidArgument( - "Param and Moment input of AdamaxOp should have same dimension")); - PADDLE_ENFORCE_EQ( - param_dims, ctx->GetInputDim("InfNorm"), - platform::errors::InvalidArgument( - "Param and InfNorm input of AdamaxOp should have same dimension")); - - ctx->SetOutputDim("ParamOut", param_dims); - ctx->SetOutputDim("MomentOut", param_dims); - ctx->SetOutputDim("InfNormOut", param_dims); - } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( @@ -150,7 +92,11 @@ division by 0 error. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); -REGISTER_OP_CPU_KERNEL( - adamax, ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +DECLARE_INFER_SHAPE_FUNCTOR(adamax, AdamaxInferMetaFunctor, + PD_INFER_META(phi::AdamaxInferMeta)); + +REGISTER_OPERATOR( + adamax, ops::AdamaxOp, ops::AdamaxOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + AdamaxInferMetaFunctor); diff --git a/paddle/fluid/operators/optimizers/adamax_op.h b/paddle/fluid/operators/optimizers/adamax_op.h deleted file mode 100644 index df0112448b1cbc82d699dc1ee6f3444bda3b142b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/optimizers/adamax_op.h +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class AdamaxOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - auto param_out_tensor = ctx.Output("ParamOut"); - auto moment_out_tensor = ctx.Output("MomentOut"); - auto inf_norm_out_tensor = ctx.Output("InfNormOut"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - moment_out_tensor->mutable_data(ctx.GetPlace()); - inf_norm_out_tensor->mutable_data(ctx.GetPlace()); - - T beta1 = static_cast(ctx.Attr("beta1")); - T beta2 = static_cast(ctx.Attr("beta2")); - T epsilon = static_cast(ctx.Attr("epsilon")); - - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - auto moment = framework::EigenVector::Flatten( - *ctx.Input("Moment")); - auto inf_norm = framework::EigenVector::Flatten( - *ctx.Input("InfNorm")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); - auto beta1_pow = framework::EigenVector::Flatten( - *ctx.Input("Beta1Pow")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto inf_norm_out = - framework::EigenVector::Flatten(*inf_norm_out_tensor); - 
auto* place = ctx.template device_context().eigen_device(); - - moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; - inf_norm_out.device(*place) = - grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); - auto lr_t = lr / (1 - beta1_pow); - Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(*place) = - param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/fluid/operators/optimizers/cast_with_ptr.h index ab8b4f2b8f4d37d4be62c5e1dd040a1461d0bdee..a3fbb0e59e24e9be67da5048ebc644f08b385bbf 100644 --- a/paddle/fluid/operators/optimizers/cast_with_ptr.h +++ b/paddle/fluid/operators/optimizers/cast_with_ptr.h @@ -57,8 +57,7 @@ static void LaunchCastKernel(const platform::CUDADeviceContext &ctx, PADDLE_ENFORCE_NE( static_cast(x), static_cast(y), platform::errors::InvalidArgument("Inplace cast is not supported yet.")); - int vec_size = - std::min(platform::GetVectorizedSize(x), platform::GetVectorizedSize(y)); + int vec_size = std::min(phi::GetVectorizedSize(x), phi::GetVectorizedSize(y)); switch (vec_size) { case 4: return details::VecCastKernel(ctx, x, y, n); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 8bb4606ffff151c6f65606d8dce156f98589a6b4..5b60f65442b55dc89a845859f153048e89704f70 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -19,11 +19,11 @@ #include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h" #include "paddle/fluid/operators/optimizers/multi_tensor_apply.h" #include "paddle/fluid/operators/tensor_to_string.h" -#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/for_range.h" #include 
"paddle/fluid/string/string_helper.h" #include "paddle/phi/core/utils/data_type.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #ifdef __NVCC__ #include "cub/cub.cuh" @@ -66,8 +66,8 @@ struct L2NormFunctor { int i; for (i = threadIdx.x * VecSize; i + VecSize <= size; i += (BlockDim * VecSize)) { - platform::AlignedVector tmp_vec; - platform::Load(ptr + i, &tmp_vec); + phi::AlignedVector tmp_vec; + phi::Load(ptr + i, &tmp_vec); #pragma unroll for (int j = 0; j < VecSize; ++j) { auto tmp = static_cast(tmp_vec[j]); @@ -111,9 +111,9 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { constexpr int max_load_bits = 128; int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); auto address = reinterpret_cast(ptr); - constexpr int vec8 = alignof(platform::AlignedVector); - constexpr int vec4 = alignof(platform::AlignedVector); - constexpr int vec2 = alignof(platform::AlignedVector); + constexpr int vec8 = alignof(phi::AlignedVector); + constexpr int vec4 = alignof(phi::AlignedVector); + constexpr int vec2 = alignof(phi::AlignedVector); chunk_size *= sizeof(T); if (address % vec8 == 0 && chunk_size % vec8 == 0) { return std::min(8, valid_vec_size); @@ -316,15 +316,15 @@ static __global__ void ScaleCUDAKernel(const T1 *__restrict__ x, int stride = blockDim.x * gridDim.x * VecSize; for (; i + VecSize <= num; i += stride) { - platform::AlignedVector x_vec; - platform::AlignedVector y_vec; + phi::AlignedVector x_vec; + phi::AlignedVector y_vec; - platform::Load(x + i, &x_vec); + phi::Load(x + i, &x_vec); #pragma unroll for (int j = 0; j < VecSize; ++j) { y_vec[j] = static_cast(static_cast(x_vec[j]) * s); } - platform::Store(y_vec, y + i); + phi::Store(y_vec, y + i); } for (; i < num; ++i) { @@ -410,24 +410,24 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( int stride = blockDim.x * gridDim.x * VecSize; for (; i + VecSize <= num; i += stride) { - platform::AlignedVector param_vec; - platform::AlignedVector grad_vec; - 
platform::AlignedVector mom1_vec; - platform::AlignedVector mom2_vec; - platform::AlignedVector trust_ratio_div_vec; + phi::AlignedVector param_vec; + phi::AlignedVector grad_vec; + phi::AlignedVector mom1_vec; + phi::AlignedVector mom2_vec; + phi::AlignedVector trust_ratio_div_vec; T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; if (cur_weight_decay != static_cast(0.0)) { - platform::Load(param_p + i, ¶m_vec); + phi::Load(param_p + i, ¶m_vec); } else { #pragma unroll for (int j = 0; j < VecSize; ++j) { param_vec[j] = static_cast(0); } } - platform::Load(grad_p + i, &grad_vec); - platform::Load(mom1_p + i, &mom1_vec); - platform::Load(mom2_p + i, &mom2_vec); + phi::Load(grad_p + i, &grad_vec); + phi::Load(mom1_p + i, &mom1_vec); + phi::Load(mom2_p + i, &mom2_vec); #define PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2, \ __trust_ratio_div, __idx) \ @@ -450,9 +450,9 @@ static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( mom2_vec, trust_ratio_div_vec, j); } - platform::Store(mom1_vec, mom1_p + i); - platform::Store(mom2_vec, mom2_p + i); - platform::Store(trust_ratio_div_vec, trust_ratio_div_p + i); + phi::Store(mom1_vec, mom1_p + i); + phi::Store(mom2_vec, mom2_p + i); + phi::Store(trust_ratio_div_vec, trust_ratio_div_p + i); } for (; i < num; ++i) { @@ -632,29 +632,29 @@ struct LambUpdateParamAndBetaPowsFunctor { trust_ratio_div += offset; for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) { - platform::AlignedVector trust_ratio_div_vec; - platform::Load(trust_ratio_div + i, &trust_ratio_div_vec); + phi::AlignedVector trust_ratio_div_vec; + phi::Load(trust_ratio_div + i, &trust_ratio_div_vec); if (HasMasterParam) { - platform::AlignedVector master_param_vec; - platform::Load(master_param + i, &master_param_vec); - platform::AlignedVector param_vec; + phi::AlignedVector master_param_vec; + phi::Load(master_param + i, &master_param_vec); + phi::AlignedVector param_vec; #pragma unroll for (int j = 0; 
j < VecSize; ++j) { MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j]; master_param_vec[j] = p; param_vec[j] = static_cast(p); } - platform::Store(master_param_vec, master_param + i); - platform::Store(param_vec, param + i); + phi::Store(master_param_vec, master_param + i); + phi::Store(param_vec, param + i); } else { - platform::AlignedVector param_vec; - platform::Load(param + i, ¶m_vec); + phi::AlignedVector param_vec; + phi::Load(param + i, ¶m_vec); #pragma unroll for (int j = 0; j < VecSize; ++j) { MT p = static_cast(param_vec[j]) - ratio * trust_ratio_div_vec[j]; param_vec[j] = static_cast(p); } - platform::Store(param_vec, param + i); + phi::Store(param_vec, param + i); } } diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index df5da1b79535cc6f5e4a638e9d32c367ea7cdb9f..fe5cd066864b82c734614e33869dff1734bee6d0 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -88,8 +88,8 @@ __device__ inline void VectorizeLarsUpdate( T* param_out, MT* velocity_out, const MT mu, MT local_lr, const MT lars_weight_decay, const MT rescale_grad, const int tid, const int grid_stride, const int numel, MT* master_param_out = nullptr) { - using VecType = paddle::platform::AlignedVector; - using VecMType = paddle::platform::AlignedVector; + using VecType = phi::AlignedVector; + using VecMType = phi::AlignedVector; int main = numel >> (VecSize >> 1); int tail_offset = main * VecSize; diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e5399ee36ba7ff4a983448d607c108db8870138c --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_momentum_op_mlu.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class MLUMergedMomentumOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto params = ctx.MultiInput("Param"); + auto params_out = ctx.MultiOutput("ParamOut"); + size_t n = params.size(); + PADDLE_ENFORCE_EQ(n, params_out.size(), + platform::errors::InvalidArgument( + "The size of Output(ParamOut) must be equal to " + "Input(Param), but got the size of Output(ParamOut) " + "is %d, the size of Input(Param) is %d.", + params_out.size(), n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(params[i], params_out[i], + platform::errors::InvalidArgument( + "The size of Input(Param) and Output(ParamOut) " + "must be the same Tensors.")); + } + + auto grads = ctx.MultiInput("Grad"); + PADDLE_ENFORCE_EQ( + n, grads.size(), + platform::errors::InvalidArgument( + "The size of Input(Grad) must be equal to Input(Param), but got " + "the size of Input(Grad) is %d, the size of Input(Param) is %d.", + grads.size(), n)); + + auto velocitys = ctx.MultiInput("Velocity"); + PADDLE_ENFORCE_EQ(n, velocitys.size(), + platform::errors::InvalidArgument( + "The size of Input(Velocity) must be equal to " + "Input(Param), but got the size of Input(Velocity) " + "is %d, the size of 
Input(Param) is %d.", + velocitys.size(), n)); + + auto velocitys_out = ctx.MultiOutput("VelocityOut"); + PADDLE_ENFORCE_EQ( + n, velocitys_out.size(), + platform::errors::InvalidArgument( + "The size of Output(VelocityOut) must be " + "equal to Input(Param), but got the size of Output(VelocityOut) is " + "%d, the size of Input(Param) is %d.", + velocitys_out.size(), n)); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(velocitys[i], velocitys_out[i], + platform::errors::InvalidArgument( + "Input(Velocity) and Output(VelocityOut) must be " + "the same Tensors.")); + } + + auto mu = ctx.Attr("mu"); + auto lrs = ctx.MultiInput("LearningRate"); + if (lrs.size() != 1) { + PADDLE_ENFORCE_EQ( + n, lrs.size(), + platform::errors::InvalidArgument( + "If the size of Input(LearningRate) is not 1, the size of " + "Input(LearningRate) must be " + "equal to Input(Param), but got the size of Input(LearningRate) " + "is %d, the size of Input(Param) is %d.", + lrs.size(), n)); + } + auto use_nesterov = ctx.Attr("use_nesterov"); + auto regularization_methods = + ctx.Attr>("regularization_method"); + auto regularization_coeffs = + ctx.Attr>("regularization_coeff"); + if (regularization_methods.size() != 0) { + PADDLE_ENFORCE_EQ( + n, regularization_methods.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_method) must be equal " + "to Input(Param), but got the size of " + "Attr(regularization_method) is %d, the size of Input(Param) is " + "%d.", + regularization_methods.size(), n)); + PADDLE_ENFORCE_EQ( + n, regularization_coeffs.size(), + platform::errors::InvalidArgument( + "The size of Attr(regularization_coeff) must be equal " + "to Input(Param), but got the size of Attr(regularization_coeff) " + "is %d, the size of Input(Param) is %d.", + regularization_coeffs.size(), n)); + } + + VLOG(5) << "use_nesterov: " << use_nesterov + << ", regularization_methods.size(): " + << regularization_methods.size() + << ", regularization_coeffs.size(): " + << 
regularization_coeffs.size(); + + auto& dev_ctx = ctx.template device_context(); + + Tensor mu_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); + MLUCnnlTensorDesc mu_tensor_desc(mu_tensor); + MLUCnnl::Fill(ctx, mu, mu_tensor_desc.get(), GetBasePtr(&mu_tensor)); + + for (size_t idx = 0; idx < n; ++idx) { + RegularizationType regularization_flag = + regularization_methods.size() > 0 && + regularization_methods[idx] == "l2_decay" + ? RegularizationType::kL2DECAY + : RegularizationType::kNONE; + T regularization_coeff = static_cast(0.0); + if (regularization_coeffs.size() != 0) { + regularization_coeff = static_cast(regularization_coeffs[idx]); + } + + auto learning_rate = lrs.size() > 1 ? lrs[idx] : lrs[0]; + auto param_out = params_out[idx]; + auto velocity_out = velocitys_out[idx]; + + auto grad = grads[idx]; + Tensor regularized_grad; + MLUCnnlTensorDesc param_desc(*param_out); + if (regularization_flag == RegularizationType::kL2DECAY) { + regularized_grad = ctx.AllocateTmpTensor( + param_out->dims(), dev_ctx); + MLUCnnlOpTensorDesc op_tensor_desc( + CNNL_OP_TENSOR_ADD, ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), param_desc.get(), + GetBasePtr(param_out), param_desc.get(), + GetBasePtr(grad), param_desc.get(), + GetBasePtr(®ularized_grad), ToCnnlDataType(), + regularization_coeff); + } else { + regularized_grad = *grad; + } + MLUCnnl::ApplyMomentum(ctx, param_desc.get(), + GetBasePtr(®ularized_grad), use_nesterov, + GetBasePtr(learning_rate), GetBasePtr(&mu_tensor), + GetBasePtr(param_out), GetBasePtr(velocity_out)); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_MLU_KERNEL(merged_momentum, ops::MLUMergedMomentumOpKernel, + ops::MLUMergedMomentumOpKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.h b/paddle/fluid/operators/pad_constant_like_op.h index 
5df167fdf726345074cdc40afd0c5b394467578f..0aedd800e1a237d4baf0092eef9bac9f7dbe862d 100644 --- a/paddle/fluid/operators/pad_constant_like_op.h +++ b/paddle/fluid/operators/pad_constant_like_op.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/padding.h" +#include "paddle/phi/kernels/funcs/padding.h" namespace paddle { namespace operators { @@ -50,8 +50,9 @@ class PadConstantLikeKernel : public framework::OpKernel { pads[j * 2 + 1] = static_cast(in_x->dims()[j] - in_y->dims()[j]); } - math::PaddingFunctor(rank, context, pads, pad_value, - *in_y, out); + phi::funcs::PaddingFunctor( + rank, context.template device_context(), pads, pad_value, + *in_y, out); } }; @@ -82,8 +83,9 @@ class PadConstantLikeGradKernel : public framework::OpKernel { pads[j * 2 + 1] = static_cast(in_dout->dims()[j] - in_y->dims()[j]); } - math::PaddingGradFunctor(rank, context, pads, *in_dout, - d_y); + phi::funcs::PaddingGradFunctor( + rank, context.template device_context(), pads, *in_dout, + d_y); } }; diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 39acba7e58aba51942d7d8de2d89e2783fd591f9..dc162ae5782f2690fcf6378603268369e4aeb9ca 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pad_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,37 +30,6 @@ class PadOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad"); - - auto x_dim = ctx->GetInputDim("X"); - auto& paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE_EQ( - static_cast(paddings.size()), x_dim.size() * 2, - platform::errors::InvalidArgument( - "Size of 'paddings' dimension should be equal to 2 * size of " - "Input(X)'s dimension, but received (size of 'paddings' dimension " - "is) %d vs (2 * size of Input(X)'s dimension is) %d.", - static_cast(paddings.size()), x_dim.size() * 2)); - for (size_t i = 0; i < paddings.size(); ++i) { - PADDLE_ENFORCE_GE(paddings[i], 0, - platform::errors::InvalidArgument( - "The element of 'paddings' should >= 0, but " - "received %d for index %d.", - paddings[i], static_cast(i))); - } - std::vector out_dims(x_dim.size()); - for (int i = 0; i < x_dim.size(); ++i) { - if ((!ctx->IsRuntime()) && (x_dim[i] == -1)) { - out_dims[i] = -1; - } else { - out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; - } - } - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - if (out_dims[0] == x_dim[0]) { - // Only pass LoD when the first dimension is equal between - // output and input. 
- ctx->ShareLoD("X", /*->*/ "Out"); - } } }; @@ -160,47 +131,13 @@ class PadOpDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pad, PadInferShapeFunctor, + PD_INFER_META(phi::PadInferMeta)); REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker, - ops::PadOpGradMaker); + ops::PadOpGradMaker, + PadInferShapeFunctor); REGISTER_OPERATOR(pad_grad, ops::PadOpGrad, ops::PadOpDoubleGradMaker, ops::PadOpDoubleGradMaker); -REGISTER_OP_CPU_KERNEL( - pad, ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel>, - ops::PadKernel>); -REGISTER_OP_CPU_KERNEL( - pad_grad, ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel>, - ops::PadGradKernel>); - -REGISTER_OP_CUDA_KERNEL( - pad, ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel>, - ops::PadKernel>); -REGISTER_OP_CUDA_KERNEL( - pad_grad, ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel>, - ops::PadGradKernel>); diff --git a/paddle/fluid/operators/pad_op.h b/paddle/fluid/operators/pad_op.h deleted file mode 100644 index d494c954e1ef73b585761acf7490a5e35beccac4..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pad_op.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/padding.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class PadKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); - float pad_value = context.Attr("pad_value"); - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - out->mutable_data(context.GetPlace()); - - int rank = x->dims().size(); - math::PaddingFunctor(rank, context, pads, - static_cast(pad_value), *x, out); - } -}; - -template -class PadGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); - auto* d_out = context.Input(framework::GradVarName("Out")); - auto* d_x = context.Output(framework::GradVarName("X")); - if (d_x == nullptr) { - return; - } - - d_x->mutable_data(context.GetPlace()); - int rank = d_out->dims().size(); - math::PaddingGradFunctor(rank, context, pads, *d_out, - d_x); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index 2a127d9ad1db0c1e169fdd1e20a1568b99d228a0..21ca26f49f653d03e2710937d360091e0c4536df 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -124,8 +124,8 @@ class PixelShuffleGradOp : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, - PT_INFER_META(phi::PixelShuffleInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pixel_shuffle, PixelShuffleInferShapeFunctor, + PD_INFER_META(phi::PixelShuffleInferMeta)); REGISTER_OPERATOR(pixel_shuffle, 
ops::PixelShuffleOp, ops::PixelShuffleOpMaker, ops::PixelShuffleGradMaker, diff --git a/paddle/fluid/operators/poisson_op.cc b/paddle/fluid/operators/poisson_op.cc index 0cecbf0b9cb027f7032b7b20fb10ef06a79503df..d5896c4105932ef7327d7093a15cf50e87308ae5 100644 --- a/paddle/fluid/operators/poisson_op.cc +++ b/paddle/fluid/operators/poisson_op.cc @@ -87,8 +87,8 @@ class PoissonGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; -DELCARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(poisson, ops::PoissonOp, ops::PoissonOpMaker, ops::PoissonOpInferVarType, diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc deleted file mode 100644 index 6335004e69a37109664940e4d3445e3694be9cc9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ /dev/null @@ -1,567 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/pool_op.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/operator.h" -#endif -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor; -using DataLayout = platform::DataLayout; -using PoolingMode = platform::PoolingMode; -template -using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; - -DataLayout getLayoutFromStr(std::string data_format) { - if (data_format == "NHWC") { - return DataLayout::kNHWC; - } else if (data_format == "NCHW") { - return DataLayout::kNCHW; - } else if (data_format == "NCDHW") { - return DataLayout::kNCDHW; - } else { - return DataLayout::kNCDHW; - } -} - -template -class PoolCUDNNOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - Tensor *output = ctx.Output("Out"); - output->mutable_data(ctx.GetPlace()); - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == 
"NDHWC"); - - // update paddings - auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - - // -----------------transformed tensor ------------------------ - - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - DataLayout layout; - - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - transformed_output.Resize(output->dims()); - - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - 
ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans; - trans(dev_ctx, *input, &transformed_input, axis); - - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - transformed_output = *output; - } - - const T *tranformed_input_data = transformed_input.data(); - T *tranformed_output_data = transformed_output.mutable_data( - transformed_output.dims(), ctx.GetPlace()); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - pooling_mode = PoolingMode::kMaximum; - } else { - pooling_mode = exclusive ? 
PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - -#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, tranformed_output_data, - false, pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingForward( - handle, cudnn_pool_desc, &alpha, cudnn_input_desc, - tranformed_input_data, &beta, cudnn_output_desc, - tranformed_output_data)); -#endif - // add - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, transformed_output, output, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose trans; - trans(dev_ctx, transformed_output, output, axis); - } -#endif - } -}; - -template -class PoolCUDNNGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - 
platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("Pool operator CUDA kernel must use " - "CUDAPlace rather than CPUPlace.")); - - const Tensor *input = ctx.Input("X"); - const Tensor *output = ctx.Input("Out"); - const Tensor *output_grad = - ctx.Input(framework::GradVarName("Out")); - Tensor *input_grad = ctx.Output(framework::GradVarName("X")); - - std::string pooling_type = ctx.Attr("pooling_type"); - bool exclusive = ctx.Attr("exclusive"); - bool adaptive = ctx.Attr("adaptive"); - std::vector ksize = ctx.Attr>("ksize"); - std::vector strides = ctx.Attr>("strides"); - std::vector paddings = ctx.Attr>("paddings"); - std::string data_format = ctx.Attr("data_format"); - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - -#ifdef PADDLE_WITH_HIP - if (pooling_type == "max") { - using OpKernelMap = paddle::framework::OperatorWithKernel::OpKernelMap; - using OpKernelFunc = paddle::framework::OperatorWithKernel::OpKernelFunc; - auto &all_op_kernels = - paddle::framework::OperatorWithKernel::AllOpKernels(); - std::string op_type = "pool2d_grad"; - auto kernels_iter = all_op_kernels.find(op_type); - PADDLE_ENFORCE_NE( - kernels_iter, all_op_kernels.end(), - platform::errors::Unavailable( - "There are no kernels which are registered in the %s operator.", - op_type)); - OpKernelMap &kernels = kernels_iter->second; - paddle::framework::OpKernelType expected_kernel_key( - paddle::framework::ToDataType(typeid(T)), ctx.GetPlace()); - auto kernel_iter = kernels.find(expected_kernel_key); - PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), - platform::errors::NotFound( - "Operator (%s) does not have kernel for %s.", - op_type, KernelTypeToString(expected_kernel_key))); - std::unique_ptr kernel_func_( - new OpKernelFunc(kernel_iter->second)); - (*kernel_func_)(ctx); - return; - } -#endif - - // update paddings 
- auto in_x_dims = input->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - // ------- tensor grad -------------- - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - Tensor transformed_output_grad(output_grad->type()); - - input_grad->mutable_data(ctx.GetPlace()); - Tensor transformed_input_grad(input_grad->type()); - DataLayout layout; - const std::string str_NCHW = "NCHW", str_NHWC = "NHWC"; - const std::string str_NCDHW = "NCDHW", str_NDHWC = "NDHWC"; - if (data_format == str_NDHWC) { - layout = DataLayout::kNCDHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 4, 1, 2, 3}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[4]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - in_dims_vec[4] = input->dims()[3]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans5; - trans5(dev_ctx, *input, &transformed_input, axis); - - // output - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[4]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - out_dims_vec[4] = output->dims()[3]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - 
phi::funcs::Transpose - trans5_v2; - trans5_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans5_v3; - trans5_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); - -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - } else if (data_format == str_NHWC) { - layout = DataLayout::kNCHW; - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 3, 1, 2}; - - // input - transformed_input.Resize(input->dims()); - auto in_dims_vec = phi::vectorize(input->dims()); - in_dims_vec[1] = input->dims()[3]; - in_dims_vec[2] = input->dims()[1]; - in_dims_vec[3] = input->dims()[2]; - transformed_input.Resize(phi::make_ddim(in_dims_vec)); - transformed_input.mutable_data(ctx.GetPlace(), input->type()); - - phi::funcs::Transpose trans4; - trans4(dev_ctx, *input, &transformed_input, axis); - - // output - transformed_output.Resize(output->dims()); - auto out_dims_vec = phi::vectorize(output->dims()); - out_dims_vec[1] = output->dims()[3]; - out_dims_vec[2] = output->dims()[1]; - out_dims_vec[3] = output->dims()[2]; - transformed_output.Resize(phi::make_ddim(out_dims_vec)); - - transformed_output.mutable_data(ctx.GetPlace(), output->type()); - - phi::funcs::Transpose - trans4_v2; - trans4_v2(dev_ctx, *output, &transformed_output, axis); - - // output grad - transformed_output_grad.Resize(phi::make_ddim(out_dims_vec)); - transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - - phi::funcs::Transpose - trans4_v3; - trans4_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); - - // input grad - transformed_input_grad.Resize(phi::make_ddim(in_dims_vec)); -#endif - } else { - layout = getLayoutFromStr(data_format); - transformed_input = *input; - 
transformed_output = *output; - transformed_output_grad = *output_grad; - transformed_input_grad = *input_grad; - } - - const T *input_data = transformed_input.data(); - const T *output_data = transformed_output.data(); - const T *output_grad_data = transformed_output_grad.data(); - - // ------------------- cudnn descriptors --------------------- - ScopedTensorDescriptor input_desc; - ScopedTensorDescriptor output_desc; - ScopedPoolingDescriptor pool_desc; - -#ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#else - cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, phi::vectorize(transformed_input.dims())); - cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, phi::vectorize(transformed_output.dims())); -#endif - PoolingMode pooling_mode; - if (pooling_type == "max") { - if (FLAGS_cudnn_deterministic) { - pooling_mode = PoolingMode::kMaximumDeterministic; - } else { - pooling_mode = PoolingMode::kMaximum; - } - } else { - pooling_mode = exclusive ? PoolingMode::kAverageExclusive - : PoolingMode::kAverageInclusive; - } - -#ifdef PADDLE_WITH_HIP - miopenPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#else - cudnnPoolingDescriptor_t cudnn_pool_desc = - pool_desc.descriptor(pooling_mode, ksize, paddings, strides); -#endif - - // ------------------- cudnn pool algorithm --------------------- - auto handle = ctx.cuda_device_context().cudnn_handle(); - ScalingParamType alpha = 1.0f, beta = 0.0f; - if (input_grad) { - T *input_grad_data = transformed_input_grad.mutable_data( - transformed_input_grad.dims(), ctx.GetPlace()); -// Because beta is zero, it is unnecessary to reset input_grad. 
-#ifdef PADDLE_WITH_HIP - char *pool_workspace; - size_t pool_worksize = 0; - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::miopenPoolingGetWorkSpaceSizeV2( - cudnn_pool_desc, cudnn_output_desc, &pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(hipMalloc(&pool_workspace, pool_worksize)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data, pool_workspace)); - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(pool_workspace)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnPoolingBackward( - handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data, - cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data, - &beta, cudnn_input_desc, input_grad_data)); -#endif - - if (data_format == str_NDHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 4, 1}; - phi::funcs::Transpose - trans5_v4; - trans5_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#ifdef PADDLE_WITH_HIP - // MIOPEN not support NHWC data layout - if (data_format == str_NHWC) { - auto &dev_ctx = - ctx.template device_context(); - std::vector axis{0, 2, 3, 1}; - phi::funcs::Transpose - trans4_v4; - trans4_v4(dev_ctx, transformed_input_grad, input_grad, axis); - } -#endif - } - } -}; - -template -class PoolCUDNNGradGradOpKernel : public PoolCUDNNOpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - std::string pooling_type = ctx.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad only supports avgpool.")); - } else { - PoolCUDNNOpKernel::Compute(ctx); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double 
-REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel); -#else -REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -REGISTER_OP_KERNEL(pool2d_grad_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel, - ops::PoolCUDNNGradGradOpKernel); - -REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel, - ops::PoolCUDNNOpKernel); -REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace, - ops::PoolCUDNNGradOpKernel, - ops::PoolCUDNNGradOpKernel); -#endif diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index ae095c2fa7aaa95cf667898b63a90988eb83caf0..44f3d8090e565c1581a49387db4b834b1abf8b62 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -15,6 +15,12 @@ limitations under the License. */ #include "paddle/fluid/operators/pool_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -23,125 +29,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -int PoolOutputSize(int input_size, int filter_size, int padding_1, - int padding_2, int stride, bool ceil_mode) { - int output_size; - if (!ceil_mode) { - output_size = - (input_size - filter_size + padding_1 + padding_2) / stride + 1; - } else { - output_size = - (input_size - filter_size + padding_1 + padding_2 + stride - 1) / - stride + - 1; - } - PADDLE_ENFORCE_GT( - output_size, 0, - platform::errors::InvalidArgument( - "the output size must be greater than 0. But received: " - "output_size = %d due to the settings of input_size(%d), " - "padding(%d,%d), " - "k_size(%d) and stride(%d). Please check again!", - output_size, input_size, padding_1, padding_2, filter_size, stride)); - return output_size; -} - -void PoolOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of Pool operator is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of Pool operator is not found.")); - - std::string pooling_type = ctx->Attrs().Get("pooling_type"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - bool ceil_mode = ctx->Attrs().Get("ceil_mode"); - bool adaptive = ctx->Attrs().Get("adaptive"); - bool global_pooling = ctx->Attrs().Get("global_pooling"); - std::string data_format = ctx->Attrs().Get("data_format"); - std::string padding_algorithm = - ctx->Attrs().Get("padding_algorithm"); - - auto in_x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - in_x_dims.size() == 4 || in_x_dims.size() == 5, true, - platform::errors::InvalidArgument( - "the input of Op(pool) should be 4-D or 5-D Tensor. 
But " - "received: %u-D Tensor and it's shape is [%s].", - in_x_dims.size(), in_x_dims)); - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "the dimension of input minus the size of " - "Attr(ksize) must be euqal to 2 in Op(pool). " - "But received: the dimension of input minus the size " - "of Attr(ksize) is %d, the " - "input's dimension is %d, the shape of input " - "is [%s], the Attr(ksize)'s size is %d, the Attr(ksize) is [%s].", - in_x_dims.size() - ksize.size(), in_x_dims.size(), in_x_dims, - ksize.size(), phi::make_ddim(ksize))); - - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "the size of Attr(ksize) and Attr(strides) in " - "Op(pool) must be equal. " - "But received: Attr(ksize)'s size is %d, Attr(strides)'s " - "size is %d, Attr(ksize) is [%s], Attr(strides)is [%s].", - ksize.size(), strides.size(), phi::make_ddim(ksize), - phi::make_ddim(strides))); - - // MKL-DNN Kernels are using NCHW order of dims description - // so we ignore data_format consideration for MKL-DNN kernel - const bool channel_last = (ctx->IsRunMKLDNNKernel() == false) && - (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings if "SAME" or global_pooling - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - std::vector output_shape; - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (int i = 0; i < data_dims.size(); ++i) { - if ((!ctx->IsRuntime()) && (data_dims[i] < 0)) { - output_shape.push_back(data_dims[i]); - } else { - output_shape.push_back( - PoolOutputSize(data_dims[i], ksize[i], paddings[2 * i], - 
paddings[2 * i + 1], strides[i], ceil_mode)); - } - } - } - - // output_N = input_N - output_shape.insert(output_shape.begin(), in_x_dims[0]); - // output_C = input_C - if (channel_last) { - output_shape.push_back(in_x_dims[in_x_dims.size() - 1]); - } else { - output_shape.insert(output_shape.begin() + 1, in_x_dims[1]); - } - - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->ShareLoD("X", "Out"); -} - bool CanMKLDNNSupportPool(const framework::ExecutionContext& ctx) { if (ctx.Attr("adaptive") == false) return true; // (jczaja): oneDNN is supporting only unchangable in size pool window @@ -216,16 +103,6 @@ framework::OpKernelType PoolOp::GetKernelTypeForVar( tensor.place(), tensor.layout()); } -void PoolOpGrad::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::NotFound( - "Input(X) of Pool Gradoperator is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::NotFound( - "Input(X@GRAD) of Pool Gradoperator is not found.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); -} - framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library_{framework::LibraryType::kPlain}; @@ -471,7 +348,7 @@ class Pool2dOpGradGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("pool2d_grad_grad"); + grad_op->SetType("pool2d_double_grad"); grad_op->SetInput("X", this->OutputGrad(framework::GradVarName("X"))); grad_op->SetOutput("Out", this->InputGrad(framework::GradVarName("Out"))); grad_op->SetAttrMap(this->Attrs()); @@ -692,35 +569,34 @@ Example: namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pool2d, Pool2dInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool2d_grad, Pool2dGradInferShapeFunctor, + 
PD_INFER_META(phi::PoolGradInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool2d_double_grad, + Pool2dDoubleGradInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); + REGISTER_OPERATOR( pool2d, ops::PoolOp, ops::Pool2dOpMaker, ops::PoolOpInferVarType, paddle::framework::DefaultGradOpMaker, - paddle::framework::DefaultGradOpMaker); + paddle::framework::DefaultGradOpMaker, + Pool2dInferShapeFunctor); REGISTER_OPERATOR(pool2d_grad, ops::PoolOpGrad, ops::Pool2dOpGradGradMaker, - ops::Pool2dOpGradGradMaker); -REGISTER_OPERATOR(pool2d_grad_grad, ops::PoolOp); - -REGISTER_OP_CPU_KERNEL( - pool2d, ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL( - pool2d_grad, ops::PoolGradKernel, - ops::PoolGradKernel); -REGISTER_OP_CPU_KERNEL( - pool2d_grad_grad, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel); + ops::Pool2dOpGradGradMaker, + Pool2dGradInferShapeFunctor); +REGISTER_OPERATOR(pool2d_double_grad, ops::PoolOp, + Pool2dDoubleGradInferShapeFunctor); + +DECLARE_INFER_SHAPE_FUNCTOR(pool3d, Pool3dInferShapeFunctor, + PD_INFER_META(phi::PoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(pool3d_grad, Pool3dGradInferShapeFunctor, + PD_INFER_META(phi::PoolGradInferMeta)); REGISTER_OPERATOR( pool3d, ops::PoolOp, ops::Pool3dOpMaker, ops::PoolOpInferVarType, paddle::framework::DefaultGradOpMaker, - paddle::framework::DefaultGradOpMaker); -REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad); - -REGISTER_OP_CPU_KERNEL( - pool3d, ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL( - pool3d_grad, ops::PoolGradKernel, - ops::PoolGradKernel); + paddle::framework::DefaultGradOpMaker, + Pool3dInferShapeFunctor); +REGISTER_OPERATOR(pool3d_grad, ops::PoolOpGrad, Pool3dGradInferShapeFunctor); diff --git a/paddle/fluid/operators/pool_op.cu b/paddle/fluid/operators/pool_op.cu deleted file mode 100644 index 069ce0c1fda853b943a7b414a7a33d9aa6405a89..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pool_op.cu +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2016 
PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/pool_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - pool2d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool2d_grad_grad, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel, - ops::PoolGradGradKernel); - -REGISTER_OP_CUDA_KERNEL( - pool3d, ops::PoolKernel, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CUDA_KERNEL( - pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel, - ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index bea6506ee86dbfe3ac606a1e8e883bfbf2500f25..d48ac3bd358ef64271de69df4424399b427cfb82 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -12,19 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#pragma once +// NOTE(Ruibiao): Difficult to remove code from this header file because too +// many files rely on it through "mkldnn_reuse.h" -#include -#include -#include +#pragma once -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__HIPCC__) || defined(__NVCC__) -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#endif namespace paddle { namespace operators { @@ -35,8 +28,6 @@ class PoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -50,8 +41,6 @@ class PoolOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override; - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override; @@ -71,292 +60,5 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override; }; -template -inline void UpdatePadding(std::vector* paddings, const bool global_pooling, - const bool adaptive, - const std::string padding_algorithm, - const framework::DDim data_dims, - const std::vector& strides, - const std::vector& ksize) { - // set padding size == data_dims.size() * 2 - auto data_shape = phi::vectorize(data_dims); - if (static_cast(paddings->size()) == data_dims.size()) { - for (int i = 0; i < data_dims.size(); ++i) { - T copy_pad = *(paddings->begin() + 2 * i); - paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); - } - } else { - PADDLE_ENFORCE_EQ(data_dims.size() * 2, paddings->size(), - platform::errors::InvalidArgument( - "Paddings size %d should be the same or 
twice as the " - "pooling size %d.", - paddings->size(), data_dims.size() * 2)); - } - - // when padding_algorithm is "VALID" or "SAME" - if (padding_algorithm == "SAME") { - for (int i = 0; i < data_dims.size(); ++i) { - T out_size = (data_dims[i] + strides[i] - 1) / strides[i]; - T pad_sum = - std::max((out_size - 1) * strides[i] + ksize[i] - data_shape[i], - static_cast(0)); - T pad_0 = pad_sum / 2; - T pad_1 = pad_sum - pad_0; - *(paddings->begin() + i * 2) = pad_0; - *(paddings->begin() + i * 2 + 1) = pad_1; - } - } else if (padding_algorithm == "VALID") { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } - - // if global_pooling == true or adaptive == true, padding will be ignore - if (global_pooling || adaptive) { - for (auto it = paddings->begin(); it != paddings->end(); it++) { - *it = 0; - } - } -} - -template -inline void UpdateKsize(std::vector* ksize, - const framework::DDim data_dims) { - ksize->resize(static_cast(data_dims.size())); - for (size_t i = 0; i < ksize->size(); ++i) { - *(ksize->begin() + i) = static_cast(data_dims[i]); - } -} - -inline int getReduceNum(const framework::Tensor& input, - const framework::Tensor* output, - const std::string data_format, - std::vector* reduce_dim) { - // data_format only can be NCHW - bool channel_last = (data_format == "NHWC"); - if (channel_last) { - return 0; - } - int reduce_num = 0; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; - if ((output_height == 1) && (output_width == 1)) { - reduce_dim->push_back(2); - reduce_dim->push_back(3); - reduce_num = input.dims()[2] * input.dims()[3]; - } - return reduce_num; -} - -template -class PoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = 
context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - std::string data_format = context.Attr("data_format"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - auto& dev_ctx = context.template device_context(); - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool2d_forward; - paddle::operators::math::MaxPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - std::vector reduce_dim; - int reduce_num = getReduceNum(*in_x, out, data_format, &reduce_dim); - if (reduce_num > 0 && - adaptive) { // for adaptive_avg_pool2d && output_size == 1 -#if defined(__HIPCC__) || defined(__NVCC__) - auto stream = dev_ctx.stream(); - TensorReduceImpl>( - dev_ctx, *in_x, out, kps::DivideFunctor(reduce_num), - reduce_dim, stream); -#else // for cpu - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - 
paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, - data_format, exclusive, adaptive, out, pool_process); -#endif - } else { // avgpool_2d or adaptive_avg_pool2d && output_size != 1 - paddle::operators::math::Pool2dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool2d_forward; - paddle::operators::math::AvgPool pool_process; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, - data_format, exclusive, adaptive, out, pool_process); - } - } - } break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::MaxPool, T> - pool3d_forward; - paddle::operators::math::MaxPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - true, false, out, pool_process); - - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dFunctor< - DeviceContext, paddle::operators::math::AvgPool, T> - pool3d_forward; - paddle::operators::math::AvgPool pool_process; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, data_format, - exclusive, adaptive, out, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class PoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::string pooling_type = context.Attr("pooling_type"); - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool exclusive = context.Attr("exclusive"); - bool adaptive = context.Attr("adaptive"); - std::string 
data_format = context.Attr("data_format"); - bool global_pooling = context.Attr("global_pooling"); - std::string padding_algorithm = - context.Attr("padding_algorithm"); - - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // update paddings - auto in_x_dims = in_x->dims(); - framework::DDim data_dims; - if (channel_last) { - data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); - } else { - data_dims = phi::slice_ddim(in_x_dims, 2, in_x_dims.size()); - } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); - if (data_dims.size() * 2 == static_cast(paddings.size())) { - for (int i = 0; i < data_dims.size(); ++i) { - paddings.erase(paddings.begin() + i + 1); - } - } - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - auto& dev_ctx = context.template device_context(); - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_constant; - set_constant(dev_ctx, in_x_grad, static_cast(0.0)); - - switch (ksize.size()) { - case 2: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool2dGradFunctor - pool2d_backward; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool2dGradFunctor< - DeviceContext, paddle::operators::math::AvgPoolGrad, T> - pool2d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } break; - case 3: { - if (pooling_type == "max") { - paddle::operators::math::MaxPool3dGradFunctor - pool3d_backward; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, in_x_grad); - } else if (pooling_type == "avg") { - paddle::operators::math::Pool3dGradFunctor< - DeviceContext, 
paddle::operators::math::AvgPoolGrad, T> - pool3d_backward; - paddle::operators::math::AvgPoolGrad pool_process; - pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, data_format, exclusive, adaptive, - in_x_grad, pool_process); - } - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; - -template -class PoolGradGradKernel : public PoolKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::string pooling_type = context.Attr("pooling_type"); - if (pooling_type == "max") { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op grad grad only supports avgpool.")); - } else { - PoolKernel::Compute(context); - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index 08656e64231b61181583cb700f2cc3216e25e516..fa88d128a9a1d572414a6459933a8988cae1fda0 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -80,10 +81,10 @@ class MLUPoolOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } MLUCnnlTensorDesc in_x_desc(*in_x, cnnl_layout, ToCnnlDataType()); @@ -191,10 +192,10 @@ class MLUPoolGradOpKernel : public framework::OpKernel { data_dims = phi::slice_ddim(in_x_dims, 1, in_x_dims.size() - 1); } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); if (global_pooling) { - UpdateKsize(&ksize, data_dims); + phi::funcs::UpdateKernelSize(&ksize, data_dims); } // inputs need with NHWC layout diff --git a/paddle/fluid/operators/pool_op_npu.cc b/paddle/fluid/operators/pool_op_npu.cc index bd26d6350d9c300949edb1a90b244a7c747dd7a9..0efcb8b7981c32e9f8d5a04f4fd4122d6725a49e 100644 --- a/paddle/fluid/operators/pool_op_npu.cc +++ b/paddle/fluid/operators/pool_op_npu.cc @@ -11,8 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_op.h" + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -68,8 +70,8 @@ class NPUPoolOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], platform::errors::InvalidArgument( @@ -201,8 +203,8 @@ class NPUPoolGradOpKernel : public framework::OpKernel { strides_vec[2] = strides[0]; strides_vec[3] = strides[1]; } - UpdatePadding(&paddings, global_pooling, adaptive, padding_algorithm, - data_dims, strides, ksize); + phi::funcs::UpdatePadding(&paddings, global_pooling, adaptive, + padding_algorithm, data_dims, strides, ksize); PADDLE_ENFORCE_LT( std::max(paddings[0], paddings[1]), ksize[0], diff --git a/paddle/fluid/operators/pool_op_xpu.cc b/paddle/fluid/operators/pool_op_xpu.cc index 402dd6c10803947f73e593d215d28246a81c6706..87c437d8a78e0122b0fc4f5a7dbf51612e40fbf2 100644 --- a/paddle/fluid/operators/pool_op_xpu.cc +++ b/paddle/fluid/operators/pool_op_xpu.cc @@ -8,13 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/pool_op.h" + #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #ifdef PADDLE_WITH_XPU namespace paddle { namespace operators { +using framework::Tensor; + xpu::Pooling_t XPUPoolingType(const std::string& pooltype, bool exclusive, bool is_test) { if (pooltype == "max") { diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index e0c24935b47509dbe473a963240f4234e168a293..e0341f4a4b4716d0ee82c9437ddc4d8bd1e35fb2 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -12,8 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pool_with_index_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,67 +32,6 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of Pooling should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Mask"), true, - platform::errors::InvalidArgument( - "Output(Mask) of Pooling should not be null.")); - - auto in_x_dims = ctx->GetInputDim("X"); - - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = 
ctx->Attrs().Get>("paddings"); - bool adaptive = ctx->Attrs().Get("adaptive"); - - PADDLE_ENFORCE( - in_x_dims.size() == 4 || in_x_dims.size() == 5, - platform::errors::InvalidArgument("Pooling intput should be 4-D or 5-D " - "tensor but received %dD-Tensor", - in_x_dims.size())); - - if (ctx->Attrs().Get("global_pooling")) { - ksize.resize(static_cast(in_x_dims.size()) - 2); - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_dims[i + 2]); - } - } - - PADDLE_ENFORCE_EQ( - in_x_dims.size() - ksize.size(), 2U, - platform::errors::InvalidArgument( - "The input size %d minus the kernel size %d should equal to 2.", - in_x_dims.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), strides.size(), - platform::errors::InvalidArgument( - "Strides size %d and pooling size %d should be the same.", - strides.size(), ksize.size())); - PADDLE_ENFORCE_EQ( - ksize.size(), paddings.size(), - platform::errors::InvalidArgument( - "Paddings size %d and pooling size %d should be the same.", - paddings.size(), ksize.size())); - - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - if (adaptive) { - output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); - } else { - for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); - } - } - ctx->SetOutputDim("Out", phi::make_ddim(output_shape)); - ctx->SetOutputDim("Mask", phi::make_ddim(output_shape)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -102,22 +45,6 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Mask"), true, - platform::errors::InvalidArgument("Input(Mask) must not be null.")); - PADDLE_ENFORCE_EQ( - 
ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) must not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "Output(X@GRAD) should not be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -331,40 +258,34 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER( namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index, + MaxPool2dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool2d_with_index_grad, + MaxPool2dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); + REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool2dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool2dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool2dWithIndexGradInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index, + MaxPool3dWithIndexInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(max_pool3d_with_index_grad, + MaxPool3dWithIndexGradInferShapeFunctor, + PD_INFER_META(phi::MaxPoolWithIndexGradInferMeta)); 
REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool3dWithIndexOpMaker, ops::MaxPoolWithIndexGradOpMaker, - ops::MaxPoolWithIndexGradOpMaker); + ops::MaxPoolWithIndexGradOpMaker, + MaxPool3dWithIndexInferShapeFunctor); REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CPU_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, + MaxPool3dWithIndexGradInferShapeFunctor); diff --git a/paddle/fluid/operators/pool_with_index_op.cu.cc b/paddle/fluid/operators/pool_with_index_op.cu.cc deleted file mode 100644 index 5497dcbd9ce255f833df24989d7a76c40bcbca06..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pool_with_index_op.cu.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/pool_with_index_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); - -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_CUDA_KERNEL( - max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel); diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h deleted file mode 100644 index 6e51a833f5c89efc2621c0ccc3d08dc42b2733a1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pool_with_index_op.h +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxPoolWithIndexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - Tensor* mask = context.Output("Mask"); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - - auto& dev_ctx = context.template device_context(); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x->dims()[i + 2]); - } - } - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexFunctor - pool2d_forward; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexFunctor - pool3d_forward; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, - mask); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } -}; - -template -class MaxPoolWithIndexGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* mask = context.Input("Mask"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - - std::vector ksize = context.Attr>("ksize"); - std::vector strides = context.Attr>("strides"); - std::vector paddings = 
context.Attr>("paddings"); - bool adaptive = context.Attr("adaptive"); - if (context.Attr("global_pooling")) { - for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; - ksize[i] = static_cast(in_x_grad->dims()[i + 2]); - } - } - - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - auto& device_ctx = context.template device_context(); - phi::funcs::set_constant(device_ctx, in_x_grad, 0); - - switch (ksize.size()) { - case 2: { - paddle::operators::math::MaxPool2dWithIndexGradFunctor - pool2d_backward; - pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - case 3: { - paddle::operators::math::MaxPool3dWithIndexGradFunctor - pool3d_backward; - pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, adaptive, in_x_grad); - } break; - default: { - PADDLE_THROW(platform::errors::InvalidArgument( - "Pool op only supports 2D and 3D input.")); - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index da637dfeb237dd4f17816e784882720dc2f2ff64..cfacffff234105ac9c6dc41b86f06594d319dcbb 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/psroi_pool_op.h" -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -82,75 +82,6 @@ class PSROIPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, - platform::errors::InvalidArgument( - "Input(ROIs) of PSROIPoolOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of PSROIPoolOp should not be null.")); - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - PADDLE_ENFORCE_EQ(input_dims.size(), 4, - platform::errors::InvalidArgument( - "The format of input tensor is NCHW")); - PADDLE_ENFORCE_EQ( - rois_dims.size(), 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - PADDLE_ENFORCE_EQ( - rois_dims[1], 4, - platform::errors::InvalidArgument( - "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - if (ctx->HasInput("RoisNum")) { - auto rois_num_dims = ctx->GetInputDim("RoisNum"); - PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, - platform::errors::InvalidArgument( - "The second dimension of RoisNum should " - "be 1, but received dimension is %d", - rois_num_dims.size())); - } - 
int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - int output_channels = ctx->Attrs().Get("output_channels"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_EQ( - input_dims[1], output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channel of X(%d) " - "should be equal to the product of " - "output_channels(%d), pooled_height(%d) and pooled_width(%d)", - input_dims[1], output_channels, pooled_height, pooled_width)); - - PADDLE_ENFORCE_GT(pooled_height, 0, - platform::errors::InvalidArgument( - "The pooled output height must be greater than 0")); - PADDLE_ENFORCE_GT(pooled_width, 0, - platform::errors::InvalidArgument( - "The pooled output width must be greater than 0")); - PADDLE_ENFORCE_GT(output_channels, 1, - platform::errors::InvalidArgument( - "The pooled output channels must greater than 1")); - PADDLE_ENFORCE_GT(spatial_scale, 0.0f, - platform::errors::InvalidArgument( - "The spatial scale must greater than 0.")); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = - output_channels; // input_dims[1] / (pooled_height * pooled_width); - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - ctx->SetOutputDim("Out", out_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,16 +95,6 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "The gradient of Out should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, - platform::errors::InvalidArgument( - "The gradient of X should not be null.")); - 
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -204,15 +125,13 @@ class PSROIPoolGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool, PsroiPoolInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(psroi_pool_grad, PsroiPoolGradInferShapeFunctor, + PD_INFER_META(phi::PsroiPoolGradInferMeta)); REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, ops::PSROIPoolGradMaker, - ops::PSROIPoolGradMaker); -REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); -REGISTER_OP_CPU_KERNEL( - psroi_pool, - ops::CPUPSROIPoolOpKernel, - ops::CPUPSROIPoolOpKernel); -REGISTER_OP_CPU_KERNEL( - psroi_pool_grad, - ops::CPUPSROIPoolGradOpKernel, - ops::CPUPSROIPoolGradOpKernel); + ops::PSROIPoolGradMaker, + PsroiPoolInferShapeFunctor); +REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp, + PsroiPoolGradInferShapeFunctor); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu deleted file mode 100644 index c1917501db8b5afebf4b7951b0f04de69758b49d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ /dev/null @@ -1,350 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/psroi_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaximumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaximumNumBlocks); -} - -template -__global__ void GPUPSROIPoolForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* output_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - const T* offset_input_data = - input_data + - (roi_batch_id * input_channels + input_channel) * height * width; - T outsum = 0; - - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - outsum += offset_input_data[input_index]; - } - } - - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - output_data[i] = is_empty ? 0. 
: outsum / bin_area; - } -} - -template -__global__ void GPUPSROIPoolBackward( - const int nthreads, const T* input_rois, const T* output_grad_data, - const float spatial_scale, const int input_channels, const int height, - const int width, const int output_channels, const int pooled_height, - const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = min(max(hstart, 0), height); - hend = min(max(hend, 0), height); - wstart = min(max(wstart, 0), width); - wend = min(max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val); - } - } - } -} - -template -class GPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - PADDLE_ENFORCE_EQ( - input_channels, output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "The channels %d of input X 
should equal the product of " - "output_channels %d x pooled_height %d x pooled_width %d.", - input_channels, output_channels, pooled_height, pooled_width)); - - int rois_num = rois->dims()[0]; - if (rois_num == 0) return; - int rois_batch_size; - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), rois_num_list.data(), ctx.GetPlace(), - rois_num_data, sizeof(int) * rois_batch_size, 0); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_list[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois 
from input(ROIs) and its LOD " - "must be the same. Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - - // set rois batch id - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - // call cuda kernel function - GPUPSROIPoolForward< - T><<>>( - output_size, in->data(), rois->data(), spatial_scale, - input_channels, height, width, output_channels, pooled_height, - pooled_width, rois_batch_id_list_gpu.data(), - out->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int input_channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (input_grad) { - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(platform::CPUPlace(), 
rois_num_list.data(), ctx.GetPlace(), - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - framework::Tensor rois_batch_id_list_gpu; - framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), - ctx.device_context(), &rois_batch_id_list_gpu); - - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); - - int output_grad_size = output_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUPSROIPoolBackward< - T><<>>( - output_grad_size, rois->data(), output_grad->data(), - spatial_scale, input_channels, height, width, output_channels, - pooled_height, pooled_width, rois_batch_id_list_gpu.data(), - input_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - psroi_pool, - ops::GPUPSROIPoolOpKernel, - ops::GPUPSROIPoolOpKernel); -REGISTER_OP_CUDA_KERNEL( - psroi_pool_grad, - ops::GPUPSROIPoolGradOpKernel, - ops::GPUPSROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h deleted file mode 100644 index 3f020d93391b0e648898c1b83858a7bd9809aa03..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/psroi_pool_op.h +++ /dev/null @@ -1,295 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class CPUPSROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto output_channels = ctx.Attr("output_channels"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - PADDLE_ENFORCE_EQ(input_channels, - output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channels of input " - "X should equal the product of " - "output_channels x pooled_height x pooled_width")); - - auto in_stride = phi::stride(in_dims); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = 
rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and the batch size of images " - " must be the same. But received the batch size of rois is %d, " - "and the batch size of images is %d", - rois_batch_size, batch_size)); - int rois_num_count = 0; - for (int i = 0; i < rois_batch_size; ++i) { - rois_num_count += rois_num_data[i]; - } - PADDLE_ENFORCE_EQ( - rois_num_count, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and RoisNum must be the same")); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("the rois_batch_size and input(X) " - "batch_size should be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num_with_lod, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and lod must be the same")); - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* input_rois = rois->data(); - - // calculate psroipooling, parallel processing can be implemented per ROI - for (int n = 0; n < rois_num; ++n) { - // set roi batch id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T 
roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - // Force too small rois to be 1 x 1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute bin size w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - // calculate each pixel of the output feature map. - int out_roi_offset = n * out_stride[0]; - for (int c = 0; c < output_channels; ++c) { - // per category - int out_plane_offset = out_roi_offset + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - int out_row_offset = out_plane_offset + ph * out_stride[2]; - for (int pw = 0; pw < pooled_width; ++pw) { - // calculate w and h at input feature map - int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); - int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - wstart = std::min(std::max(wstart, 0), width); - hend = std::min(std::max(hend, 0), height); - wend = std::min(std::max(wend, 0), width); - - int output_index = out_row_offset + pw; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_plane_offset = - roi_batch_id * in_stride[0] + input_channel * in_stride[1]; - const T* offset_input_data = input_data + input_plane_offset; - T out_sum = 0.; - bool is_empty = (hend <= hstart) || (wend <= wstart); - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * in_stride[2] + iw; - out_sum += offset_input_data[input_index]; - } - } - T bin_area = (hend - hstart) * (wend - wstart); - 
output_data[output_index] = is_empty ? 0. : out_sum / bin_area; - } - } - } - } - return; - } -}; - -template -class CPUPSROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto output_channels = ctx.Attr("output_channels"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - if (input_grad) { - auto in_dims = in->dims(); - int input_channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - // set roi batch id - framework::Tensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - rois_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - const T* input_rois = rois->data(); - const T* output_grad_data = output_grad->data(); - T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - // set gradient of X to be 0. before backpropagate. 
- phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), input_grad, - static_cast(0)); - - // backpropagate gradient per output pixel - int output_grad_size = output_grad->numel(); - for (int i = 0; i < output_grad_size; ++i) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = - static_cast(round(offset_input_rois[0])) * spatial_scale; - T roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - T roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - T roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; - - // Force too small ROIs to be 1x1 - T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 - T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); - int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); - int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); - int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); - - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - hend = std::min(std::max(hend, 0), height); - wstart = std::min(std::max(wstart, 0), width); - wend = std::min(std::max(wend, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - // Accumulate diff_val into input data - T bin_area = static_cast((hend - hstart) * (wend - wstart)); - T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area; - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * width + iw; - offset_input_grad_data[input_index] += diff_val; - } - } - } - } - return; - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc index 6b0d6f332bcae8890cdfaccb1244886daa63ae42..54e31845ad4bd5ddfa81bc90a10391f027dffc11 100644 --- a/paddle/fluid/operators/put_along_axis_op.cc +++ b/paddle/fluid/operators/put_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/put_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -123,16 +124,3 @@ REGISTER_OPERATOR(put_along_axis, ops::PutAlongAxisOp, ops::PutAlongAxisOpMaker, paddle::operators::PutAlongAxisInplaceInferer); REGISTER_OPERATOR(put_along_axis_grad, ops::PutAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(put_along_axis, ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.cu b/paddle/fluid/operators/put_along_axis_op.cu deleted file mode 100644 index 5508023efad2c60a00f5ea3a8d1b853c6e5ba1fb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/put_along_axis_op.cu +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/put_along_axis_op.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PutAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisCUDAKernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - const platform::DeviceContext &device_ctx = ctx.device_context(); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - 
PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpCUDAKernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel( - *result_grad, axis, *index, *value_grad, - ctx.device_context()); // the gradient of scatter is gather - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(put_along_axis, ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - 
ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.h b/paddle/fluid/operators/put_along_axis_op.h deleted file mode 100644 index 38487f5ce28c9e35dd6e84403b88dbc0fdfa07b3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/put_along_axis_op.h +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class PutAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisOpKernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - const platform::DeviceContext &device_ctx = ctx.device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - 
PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add‘, 'assign', 'mul' and 'multiply', the " - "defalut reduce " - "op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpKernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_input_grad_kernel( - // Here passing an unused argument *result_grad, because it's - // convenient to instantiate a bunch of template function with the - // same arguments list. 
- *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - cpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index 24741efe426b18b7cecae9332c522d67aee98d63..c7e91ba35dee1356ddd71ade0fe9892f8032c77b 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 21c23a7f602a35acf676e97a9134c2c43a73126c..4b6759ea165edf29add66ee44461fdd4d9f84d00 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -70,9 +70,25 @@ BufferedReader::BufferedReader( stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + int dev_idx = place_.device; + compute_stream_ = + ((platform::MLUDeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events_.resize(buffer_size); + for (auto &event : 
events_) { + event = platform::MluEventResourcePool::Instance().New(dev_idx); + } + stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx); + } +#endif cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); + mlu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -256,6 +272,56 @@ void BufferedReader::ReadAsync(size_t i) { platform::NPUStreamSync(stream_.get()); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + TensorVec &mlu = mlu_buffer_[i]; + if (mlu.empty()) { + mlu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ( + mlu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on MLU and CPU devices are not matched. " + "The number on MLU is %d, on CPU is %d", + mlu.size(), cpu.size())); + } + + std::vector mlu_ptrs; + mlu_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + mlu[i].Resize(cpu[i].dims()); + mlu[i].set_layout(cpu[i].layout()); + mlu_ptrs.emplace_back(mlu[i].mutable_data(place_, cpu[i].type())); + } + + platform::SetMLUDeviceId(place_.device); + PADDLE_ENFORCE_MLU_SUCCESS( + cnPlaceNotifier(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnWaitNotifier(events_[i].get())); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto mlu_ptr = mlu_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + if ((platform::is_mlu_place(cpu_place))) { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + } else { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + platform::MLUStreamSync(stream_.get()); + } + mlu[i].set_lod(cpu[i].lod()); + } + platform::MLUStreamSync(stream_.get()); + } +#endif return i; })); } @@ -291,6 +357,8 @@ void 
BufferedReader::ReadNextImpl(std::vector *out) { *out = std::move(cuda_buffer_[i]); } else if (platform::is_npu_place(place_)) { *out = std::move(npu_buffer_[i]); + } else if (platform::is_mlu_place(place_)) { + *out = std::move(mlu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 3d42486c6df8815aaab8e55e29898700bb74d953..f0f3b6b7f9fdfeb69c46e7122fae5c6cfbf3a169 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -29,6 +29,11 @@ #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_resource_pool.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" +#endif + namespace paddle { namespace operators { namespace reader { @@ -70,6 +75,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector cpu_buffer_; std::vector cuda_buffer_; std::vector npu_buffer_; + std::vector mlu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; @@ -82,6 +88,12 @@ class BufferedReader : public framework::DecoratedReader { std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_MLU + mluStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 28a8484f539fc94d055aaf9fcbc0a420747d0964..18e444702fbb2cc19912a32587f96330e6e8632d 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -82,8 +82,8 @@ DECLARE_INPLACE_OP_INFERER(RealGradOpInplaceInferer, } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, - 
PT_INFER_META(phi::RealAndImagInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(real, RealInferShapeFunctor, + PD_INFER_META(phi::RealAndImagInferMeta)); namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu index e8e4ff7010d3df01cda514d51796b789ef5e1da6..a724524716be39e554c6046ca809624b7fbb053a 100644 --- a/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu +++ b/paddle/fluid/operators/reduce_ops/check_reduce_rank_test.cu @@ -39,9 +39,9 @@ TEST(test_reduce_rank_check, all) { } if (is_valid) { - phi::kernels::details::CheckReduceRank(reduce_rank, rank); + phi::funcs::details::CheckReduceRank(reduce_rank, rank); } else { - ASSERT_THROW(phi::kernels::details::CheckReduceRank(reduce_rank, rank), + ASSERT_THROW(phi::funcs::details::CheckReduceRank(reduce_rank, rank), paddle::platform::EnforceNotMet); } } diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc index 955cf8d4448c1b23319fa3e0c10dbd12ae3bf49c..9115d21b195e1b615f43b01af61bbdebd1e70294 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace framework { class OpDesc; @@ -28,9 +32,17 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(reduce_all, ReduceAllInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); +class ReduceAllOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_all"; } + virtual std::string GetOpType() const { return "Reduce reduce_all"; } +}; // kernel's device type is decided by input tensor place, 
to be consistent with // compare and logical ops -REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all, UseInputPlace); -REGISTER_OP_CPU_KERNEL(reduce_all, - ops::BoolReduceKernel); +REGISTER_OPERATOR( + reduce_all, ops::ReduceOpUseInputPlace, ReduceAllOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ReduceAllInferShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc index fa3800dd3c9e46c20df54d748a61166a75be492b..69561b93498883bdf2adcfa3982d24bc1e727be0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -14,6 +14,9 @@ #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { class OpDesc; @@ -28,9 +31,18 @@ class CPUDeviceContext; } // namespace platform } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(reduce_any, ReduceAnyInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +class ReduceAnyOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_any"; } + virtual std::string GetOpType() const { return "Reduce reduce_any"; } +}; // kernel's device type is decided by input tensor place, to be consistent with // compare and logical ops -REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any, UseInputPlace); -REGISTER_OP_CPU_KERNEL(reduce_any, - ops::BoolReduceKernel); +REGISTER_OPERATOR( + reduce_any, ops::ReduceOpUseInputPlace, ReduceAnyOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ReduceAnyInferShapeFunctor); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index 
d057ee8f5d798f61c13d5c5c166c9d71b6716d6f..e327d19ab3be8daff08b4e358081d2792fd30835 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -35,7 +35,7 @@ namespace p = paddle::platform; using Tensor = paddle::framework::Tensor; -USE_OP(reduce_any); +USE_OP_ITSELF(reduce_any); USE_OP_DEVICE_KERNEL(reduce_any, NPU); template diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index cb438b4a8057267015c8b3c15dd8468fca5a4b44..41df8e4a15f093a40a31c70eea98dfb7e575f4cd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -14,15 +14,28 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_max); -REGISTER_OP_CPU_KERNEL( - reduce_max, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMaxOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_max"; } + virtual std::string GetOpType() const { return "Reduce reduce_max"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_max, ReduceMaxInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + reduce_max, ops::ReduceOp, ReduceMaxOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMaxInferShapeFunctor); +REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) + REGISTER_OP_CPU_KERNEL( reduce_max_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu deleted file mode 100644 index 
8194805ddc3736b365667883447cc13d7b729494..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -// reduce_max -REGISTER_OP_CUDA_KERNEL( - reduce_max, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 6157a3a925de51a9b65efbb2df9d5178132b1baf..4a18330913803f822436118a35fb957b7e31b391 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -96,8 +96,8 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker { virtual std::string GetOpType() const { return "Reduce reduce_mean"; } }; -DELCARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, - PT_INFER_META(phi::MeanRawInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__, ops::ReduceMeanOpGradMaker, diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc 
index 11aa78382e319331dc65ec22927f0d5762adfb43..b9915f2b484f140bfd776b64459a19c6788a55c9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cc @@ -14,15 +14,28 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_min); -REGISTER_OP_CPU_KERNEL( - reduce_min, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMinOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_min"; } + virtual std::string GetOpType() const { return "Reduce reduce_min"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_min, ReduceMinInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + reduce_min, ops::ReduceOp, ReduceMinOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMinInferShapeFunctor); +REGISTER_OPERATOR(reduce_min_grad, ops::ReduceGradOp) + REGISTER_OP_CPU_KERNEL( reduce_min_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu deleted file mode 100644 index 44548b8d2e778e4a570d085be6f2538b64ab7824..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -// reduce_min -REGISTER_OP_CUDA_KERNEL( - reduce_min, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 3aab906804f7adb95f80aa2675f01217b0b48d39..160617695338a9f2e140b7b418c93ef0d7c57e17 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -23,8 +23,7 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/gpu/reduce.h" - +#include "paddle/phi/kernels/funcs/reduce_function.h" namespace paddle { namespace operators { @@ -37,9 +36,9 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, gpuStream_t stream) { y->mutable_data(x.place()); - phi::kernels::TensorReduceImpl( + phi::funcs::ReduceKernel( static_cast(dev_ctx), x, y, transform, - origin_reduce_dims, stream); + origin_reduce_dims); } } // namespace operators diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 8ef0712dc7a757dfe91e48e7b0bb32f24840e02e..2a78774f3706e73bd8931e80fe020faac58d7ff5 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -102,8 +102,8 @@ class ReduceSumOpMaker : public ops::ReduceOpMaker { virtual 
std::string GetOpType() const { return "Reduce reduce_sum"; } }; -DELCARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, - PT_INFER_META(phi::ReduceInferMetaBase)); +DECLARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, + PD_INFER_META(phi::SumRawInferMeta)); REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, ops::ReduceSumVarTypeInference, @@ -114,16 +114,3 @@ REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumGradNoNeedBufferVarInferer); - -template -using CPUReduceSumGradKernel = - ops::ReduceSumGradKernel; - -REGISTER_OP_CPU_KERNEL( - reduce_sum_grad, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel, CPUReduceSumGradKernel, - CPUReduceSumGradKernel>, - CPUReduceSumGradKernel>); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu deleted file mode 100644 index 2f6bf127518090916c4b947daf1d1f202fdd5960..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" - -template -using CUDAReduceSumGradKernel = - ops::ReduceCudaGradKernel; - -REGISTER_OP_CUDA_KERNEL( - reduce_sum_grad, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel>, - CUDAReduceSumGradKernel>); diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h index b636184ae457edf5c8028fecfb92a3ea96f5a0d9..a473b54c1f855945a5f3f0ac8d0826b15494ba1a 100644 --- a/paddle/fluid/operators/rnn_op.h +++ b/paddle/fluid/operators/rnn_op.h @@ -16,9 +16,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/unique_op.h" @@ -36,6 +36,14 @@ using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; using TensorList = std::vector; +template +using EigenMatrix = framework::EigenMatrix; + +template +using EigenVector = framework::EigenVector; + #define DEFINE_MODE_DETECTOR(MODE_NAME, MODE_STR) \ inline bool is_##MODE_NAME(const framework::ExecutionContext& ctx) { \ const std::string& mode = ctx.Attr("mode"); \ diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 5627b4f229e100d9979663e8688b8694188bab0f..ac0cd75237baf5e8b860f197d42cd27bae65270e 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -226,11 +226,7 @@ REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, ops::ROIAlignGradMaker); REGISTER_OPERATOR(roi_align_grad, 
ops::ROIAlignGradOp, ops::RoiAlignGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - roi_align, - ops::CPUROIAlignOpKernel, - ops::CPUROIAlignOpKernel, - ops::CPUROIAlignOpKernel); + REGISTER_OP_CPU_KERNEL( roi_align_grad, ops::CPUROIAlignGradOpKernel, diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index 18941d10e937d3c28e5793384f00d9d97225a128..1a2e64cd45ca401f5fb8ca6b6975a029ba735280 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -33,43 +33,6 @@ static inline int NumBlocks(const int N) { kNumMaxinumNumBlocks); } -template -__device__ T BilinearInterpolate(const T* input_data, const int height, - const int width, T y, T x) { - if (y < -1.0 || y > height || x < -1.0 || x > width) { - return 0; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 0 : x; - int y_low = static_cast(y); - int x_low = static_cast(x); - int y_high; - int x_high; - if (y_low >= height - 1) { - y_high = y_low = height - 1; - y = static_cast(y_low); - } else { - y_high = y_low + 1; - } - if (x_low >= width - 1) { - x_high = x_low = width - 1; - x = static_cast(x_low); - } else { - x_high = x_low + 1; - } - T ly = y - y_low, lx = x - x_low; - T hy = 1. - ly, hx = 1. 
- lx; - - T v1 = input_data[y_low * width + x_low]; - T v2 = input_data[y_low * width + x_high]; - T v3 = input_data[y_high * width + x_low]; - T v4 = input_data[y_high * width + x_high]; - T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; - - T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; -} - template __device__ void BilinearInterpolateGradient(const int height, const int width, T y, T x, T* w1, T* w2, T* w3, @@ -102,65 +65,6 @@ __device__ void BilinearInterpolateGradient(const int height, const int width, return; } -template -__global__ void GPUROIAlignForward( - const int nthreads, const T* input_data, const T* input_rois, - const float spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - const int sampling_ratio, int* roi_batch_id_data, T* output_data, - const bool continuous_coordinate) { - CUDA_KERNEL_LOOP(i, nthreads) { - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % channels; - int n = i / pooled_width / pooled_height / channels; - - const T* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = roi_batch_id_data[n]; - - T roi_offset = continuous_coordinate ? 
static_cast(0.5) : 0; - T roi_xmin = offset_input_rois[0] * spatial_scale - roi_offset; - T roi_ymin = offset_input_rois[1] * spatial_scale - roi_offset; - T roi_xmax = offset_input_rois[2] * spatial_scale - roi_offset; - T roi_ymax = offset_input_rois[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!continuous_coordinate) { - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - } - - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - const T* offset_input_data = - input_data + (roi_batch_ind * channels + c) * height * width; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = - (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); - T output_val = 0; - for (int iy = 0; iy < roi_bin_grid_h; iy++) { - const T y = roi_ymin + ph * bin_size_h + - static_cast(iy + .5f) * bin_size_h / - static_cast(roi_bin_grid_h); - for (int ix = 0; ix < roi_bin_grid_w; ix++) { - const T x = roi_xmin + pw * bin_size_w + - static_cast(ix + .5f) * bin_size_w / - static_cast(roi_bin_grid_w); - T val = BilinearInterpolate(offset_input_data, height, width, y, x); - output_val += val; - } - } - output_val /= count; - output_data[i] = output_val; - } -} - template __global__ void GPUROIAlignBackward( const int nthreads, const T* input_rois, const T* out_grad, @@ -236,105 +140,6 @@ __global__ void GPUROIAlignBackward( } } -template -class GPUROIAlignOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = 
ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - - if (rois_num == 0) return; - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; -#ifdef WITH_NV_JETSON - platform::ChangeThreadNum(ctx.cuda_device_context(), &threads, 256); -#endif - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - auto& dev_ctx = ctx.cuda_device_context(); - auto gplace = ctx.GetPlace(); - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - int rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and imgs " - "batch_size must be the same. But received rois_batch_size = %d, " - "batch_size = %d", - rois_batch_size, batch_size)); - - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, rois_num_list.data(), gplace, - rois_num_t->data(), sizeof(int) * rois_batch_size, 0); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_list[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_list[n]; - } - } else { - auto lod = rois->lod(); - PADDLE_ENFORCE_EQ( - lod.empty(), false, - platform::errors::InvalidArgument("Input(ROIs) in ROIAlignOp does " - "not contain LoD information.")); - auto rois_lod = lod.back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and batch size " - "of images must be the same. 
But received rois batch size = %d, " - "and images batch size = %d", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The actual number of rois and the number of rois " - "provided from Input(RoIsLoD) in RoIAlign must be the same." - " But received actual number of rois is %d, and the number " - "of rois from RoIsLoD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, - dev_ctx.stream()); - GPUROIAlignForward<<>>( - output_size, in->data(), rois->data(), spatial_scale, channels, - height, width, pooled_height, pooled_width, sampling_ratio, roi_id_data, - out->mutable_data(ctx.GetPlace()), aligned); - } -}; - template class GPUROIAlignGradOpKernel : public framework::OpKernel { public: @@ -416,10 +221,6 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - roi_align, - ops::GPUROIAlignOpKernel, - ops::GPUROIAlignOpKernel); REGISTER_OP_CUDA_KERNEL( roi_align_grad, ops::GPUROIAlignGradOpKernel, diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h index e71099ed99f00f5846e6e23d5d39b3b2f8997531..589e35e4ab7ae4caf5efd3fb4d93a26b2ca86b26 100644 --- a/paddle/fluid/operators/roi_align_op.h +++ b/paddle/fluid/operators/roi_align_op.h @@ -23,152 +23,6 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -namespace { // NOLINT -constexpr size_t get_offset(size_t x, size_t y, size_t width) { - return y * width + x; -} - 
-template -struct offsets_and_ratios { - offsets_and_ratios() = default; - offsets_and_ratios(std::size_t xy, std::size_t xY, std::size_t Xy, - std::size_t XY, T xy_ratio, T xY_ratio, T Xy_ratio, - T XY_ratio) - : xy(xy), - xY(xY), - Xy(Xy), - XY(XY), - xy_ratio(xy_ratio), - xY_ratio(xY_ratio), - Xy_ratio(Xy_ratio), - XY_ratio(XY_ratio) {} - - std::size_t xy = 0; - std::size_t xY = 0; - std::size_t Xy = 0; - std::size_t XY = 0; - T xy_ratio = 0.0f; - T xY_ratio = 0.0f; - T Xy_ratio = 0.0f; - T XY_ratio = 0.0f; -}; - -template -std::vector> get_indexes_and_ratios( - std::size_t width, std::size_t height, const T roi_width, - const T roi_height, const T roi_xmin, const T roi_ymin, - std::size_t pooled_width, std::size_t roi_bin_grid_w, - std::size_t pooled_height, std::size_t roi_bin_grid_h) { - const auto ind_num = - pooled_width * roi_bin_grid_w * pooled_height * roi_bin_grid_h; - - std::vector> interpolation_cords; - interpolation_cords.reserve(ind_num); - - const auto bin_w = roi_width / pooled_width; - const auto bin_h = roi_height / pooled_height; - - for (std::size_t py = 0; py < pooled_height; py++) { - for (std::size_t px = 0; px < pooled_width; px++) { - for (std::size_t iy = 0; iy < roi_bin_grid_h; iy++) { - // calculate x of sample points - auto y = - roi_ymin + - bin_h * (py + - static_cast(iy + .5f) / static_cast(roi_bin_grid_h)); - for (std::size_t ix = 0; ix < roi_bin_grid_w; ix++) { - // calculate x of sample points - auto x = roi_xmin + - bin_w * (px + - static_cast(ix + .5f) / - static_cast(roi_bin_grid_w)); - - // deal with elements out of map - if (y < -1.0 || y > height || x < -1.0 || x > width) { - interpolation_cords.emplace_back(); - continue; - } - y = y <= 0 ? 0 : y; - x = x <= 0 ? 
0 : x; - - std::size_t x_low_index = static_cast(x); - std::size_t x_high_index; - if (x_low_index >= width - 1) { - x_high_index = x_low_index = width - 1; - x = static_cast(x_low_index); - } else { - x_high_index = x_low_index + 1; - } - T x_ratio = x_high_index - x; - - std::size_t y_low_index = static_cast(y); - std::size_t y_high_index; - if (y_low_index >= height - 1) { - y_high_index = y_low_index = height - 1; - y = static_cast(y_low_index); - } else { - y_high_index = y_low_index + 1; - } - T y_ratio = y_high_index - y; - - auto xy = get_offset(x_low_index, y_low_index, width); - auto xY = get_offset(x_low_index, y_high_index, width); - auto Xy = get_offset(x_high_index, y_low_index, width); - auto XY = get_offset(x_high_index, y_high_index, width); - - auto xy_ratio = x_ratio * y_ratio; - auto xY_ratio = x_ratio * (1 - y_ratio); - auto Xy_ratio = (1 - x_ratio) * y_ratio; - auto XY_ratio = (1 - x_ratio) * (1 - y_ratio); - - interpolation_cords.emplace_back(xy, xY, Xy, XY, xy_ratio, xY_ratio, - Xy_ratio, XY_ratio); - } - } - } - } - return interpolation_cords; -} // namespace - -template -void interpolate(std::vector& interpolated_values, // NOLINT - const std::vector>& interpolation_cords, - const T* data) { - for (auto& ic : interpolation_cords) { - auto xlyl_offset = ic.xy; - auto xhyl_offset = ic.Xy; - auto xlyh_offset = ic.xY; - auto xhyh_offset = ic.XY; - - auto xlyl_ratio = ic.xy_ratio; - auto xhyl_ratio = ic.Xy_ratio; - auto xlyh_ratio = ic.xY_ratio; - auto xhyh_ratio = ic.XY_ratio; - - interpolated_values.emplace_back( - xlyl_ratio * data[xlyl_offset] + xhyl_ratio * data[xhyl_offset] + - xlyh_ratio * data[xlyh_offset] + xhyh_ratio * data[xhyh_offset]); - } -} - -template -void avg_pool(const std::vector& interpolated_values, T* output_data, - int roi_bin_grid_w, int roi_bin_grid_h, int pooled_width, - int pooled_height) { - const auto data_amount = pooled_width * pooled_height; - const auto grid_points = roi_bin_grid_w * roi_bin_grid_h; - const T 
count = 1.0 / grid_points; - auto val_begin = interpolated_values.cbegin(); - for (auto i = 0; i < data_amount; ++i) { - T sum = 0.0; - auto val_end = val_begin + grid_points; - sum = std::accumulate(val_begin, val_end, sum); - val_begin = val_end; - output_data[i] = sum * count; - } -} -} // NOLINT - template void bilinear_interpolate_gradient(const int height, const int width, T y, T x, const T out_grad_this_bin, const T count, @@ -213,129 +67,6 @@ void bilinear_interpolate_gradient(const int height, const int width, T y, T x, } } -template -class CPUROIAlignOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - auto in_stride = phi::stride(in_dims); - auto roi_stride = phi::stride(rois->dims()); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - framework::Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - int* roi_batch_id_data = - roi_batch_id_list.mutable_data(ctx.GetPlace()); - int rois_batch_size; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and the batch size of images " - " must be the same. 
But received the batch size of rois is %d, " - "and the batch size of images is %d", - rois_batch_size, batch_size)); - auto* rois_num_data = rois_num_t->data(); - int start = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int i = start; i < start + rois_num_data[n]; ++i) { - roi_batch_id_data[i] = n; - } - start += rois_num_data[n]; - } - } else { - auto lod = rois->lod(); - PADDLE_ENFORCE_EQ(lod.empty(), false, - platform::errors::InvalidArgument( - "Input(ROIs) Tensor of ROIAlignOp " - "does not contain LoD information.")); - auto rois_lod = lod.back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and imgs " - "batch_size must be the same. But received rois_batch_size = %d, " - "batch_size = %d", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The actual number of rois and the number of rois " - "provided from Input(RoIsLoD) in RoIAlign must be the same." - " But received actual number of rois is %d, and the number " - "of rois from RoIsLoD is %d", - rois_num, rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (std::size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - } - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* rois_data = rois->data(); - T roi_offset = aligned ? 
T(0.5) : 0; - for (int n = 0; n < rois_num; ++n) { - int roi_batch_id = roi_batch_id_data[n]; - T roi_xmin = rois_data[0] * spatial_scale - roi_offset; - T roi_ymin = rois_data[1] * spatial_scale - roi_offset; - T roi_xmax = rois_data[2] * spatial_scale - roi_offset; - T roi_ymax = rois_data[3] * spatial_scale - roi_offset; - - T roi_width = roi_xmax - roi_xmin; - T roi_height = roi_ymax - roi_ymin; - if (!aligned) { - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); - } - - const T* batch_data = input_data + roi_batch_id * in_stride[0]; - - int roi_bin_grid_h = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_height / pooled_height); - int roi_bin_grid_w = (sampling_ratio > 0) - ? sampling_ratio - : ceil(roi_width / pooled_width); - - auto interpolation_cords = get_indexes_and_ratios( - width, height, roi_width, roi_height, roi_xmin, roi_ymin, - pooled_width, roi_bin_grid_w, pooled_height, roi_bin_grid_h); - - std::vector interpolated_values; - interpolated_values.reserve(interpolation_cords.size()); - for (auto channel = 0; channel < channels; ++channel) { - interpolate(interpolated_values, interpolation_cords, batch_data); - avg_pool(interpolated_values, output_data, roi_bin_grid_w, - roi_bin_grid_h, pooled_width, pooled_height); - batch_data += in_stride[1]; - output_data += out_stride[1]; - interpolated_values.clear(); - } - rois_data += roi_stride[0]; - } - } -}; - template class CPUROIAlignGradOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc index d5b63854d99053ac0620a32cfaba267c7262d515..78509e4299b80ee44610ce3d10f9c57afa0cde18 100644 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ b/paddle/fluid/operators/roi_align_op_npu.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/roi_align_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc index 09d2d906653e8c71ddeca7fa606cf5adac8cc596..13490d6fcde3a22e7299db21969d7de6f9a6582c 100644 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ b/paddle/fluid/operators/roi_align_op_xpu.cc @@ -13,13 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/roi_align_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + template class XPUROIAlignOpKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index 6da73c99068bc0e0453dfdd1b5eca8e1add1954b..7fe6623dcca14afc8fafc4875ccfb7546e4456f0 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -38,7 +38,8 @@ class SaveCombineOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { - return expected_kernel_type; + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place()); } }; diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index e4410b21b541320c1d39c3ad155dfce6f74b7dc2..cbf2b9152079e13acd4a221ece402b946b844999 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -121,8 +121,8 
@@ DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(scale, ScaleInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, ops::ScaleGradMaker, diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc index bb02bb541e14f551bb749c890877e4753d225c3c..0ae0e1500c16627fc269b31c57b25c47055d7d34 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cc +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -12,10 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/scatter_nd_add_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -24,73 +27,6 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of ScatterNdAddOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Input(Index) of ScatterNdAddOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Updates"), true, - platform::errors::InvalidArgument( - "Input(Updates) of ScatterNdAddOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of 
ScatterNdAddOp should not be null.")); - - auto ref_dims = ctx->GetInputDim("X"); - auto ref_dims_size = ref_dims.size(); - auto index_dims = ctx->GetInputDim("Index"); - auto index_dims_size = index_dims.size(); - auto updates_dims = ctx->GetInputDim("Updates"); - auto updates_dims_size = updates_dims.size(); - - PADDLE_ENFORCE_LE( - index_dims[index_dims_size - 1], ref_dims_size, - platform::errors::InvalidArgument( - "The last dimension of Input(Index)'s shape should be no greater " - "than the rank of Input(X), but received the last dimension of " - "Input(Index)'s shape is %d, the rank of Input(X) is %d.", - index_dims[index_dims_size - 1], ref_dims_size)); - PADDLE_ENFORCE_GE(index_dims_size, 2UL, - platform::errors::InvalidArgument( - "The rank of Input(Index) should be greater than 1, " - "but received the rank of Input(Index) is %d.", - index_dims_size)); - - // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:] - std::vector r_updates_dims; - for (int64_t i = 0; i < index_dims_size - 1; ++i) { - r_updates_dims.emplace_back(index_dims[i]); - } - for (int64_t i = index_dims[index_dims_size - 1]; i < ref_dims_size; ++i) { - r_updates_dims.emplace_back(ref_dims[i]); - } - - PADDLE_ENFORCE_EQ( - r_updates_dims.size(), updates_dims_size, - platform::errors::InvalidArgument( - "Updates has wrong shape. The shape of Updates and Input(Updates) " - "should be same, but received the shape of Updates is %d, " - "the shape of Input(Updates) is %d.", - r_updates_dims.size(), updates_dims_size)); - - for (int64_t i = 0; i < updates_dims_size; ++i) { - PADDLE_ENFORCE_EQ( - r_updates_dims[i], updates_dims[i], - platform::errors::InvalidArgument( - "Updates has wrong shape. 
The dimensions of Updates and " - "Input(Updates) should match, but received Updates's" - "%d-th dimension is %d, Input(Updates)'s %d-th " - "dimension is %d.", - i, r_updates_dims[i], i, updates_dims[i])); - } - ctx->SetOutputDim("Out", ref_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -99,7 +35,8 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "Ref and Updates must have same type")); return framework::OpKernelType( - framework::TransToProtoVarType(ctx.Input("X")->type()), + framework::TransToProtoVarType( + ctx.Input("X")->type()), ctx.device_context()); } }; @@ -108,17 +45,6 @@ class ScatterNdAddGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->HasOutput(framework::GradVarName("Updates"))) { - ctx->SetOutputDim(framework::GradVarName("Updates"), - ctx->GetInputDim("Updates")); - } - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -193,22 +119,18 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ScatterNdAddGradNoNeedBufferVarsInferer, namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add, ScatterNdAddInferShapeFunctor, + PD_INFER_META(phi::ScatterNdAddInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(scatter_nd_add_grad, + ScatterNdAddGradInferShapeFunctor, + PD_INFER_META(phi::ScatterNdAddGradInferMeta)); + REGISTER_OPERATOR(scatter_nd_add, ops::ScatterNdAddOp, ops::ScatterNdAddOpMaker, ops::ScatterNdAddGradMaker, - ops::ScatterNdAddGradMaker); + ops::ScatterNdAddGradMaker, + ScatterNdAddInferShapeFunctor); 
REGISTER_OPERATOR(scatter_nd_add_grad, ops::ScatterNdAddGradOp, - ops::ScatterNdAddGradNoNeedBufferVarsInferer); - -REGISTER_OP_CPU_KERNEL(scatter_nd_add, ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel, - ops::ScatterNdAddOpKernel); - -REGISTER_OP_CPU_KERNEL(scatter_nd_add_grad, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel, - ops::ScatterNdAddGradientOpKernel); + ops::ScatterNdAddGradNoNeedBufferVarsInferer, + ScatterNdAddGradInferShapeFunctor); diff --git a/paddle/fluid/operators/scatter_nd_add_op.cu b/paddle/fluid/operators/scatter_nd_add_op.cu deleted file mode 100644 index 2fe3fcb759d348b36cd6a7a2609bea210d24705f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scatter_nd_add_op.cu +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter_nd_add_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class ScatterNdAddOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Index"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - - framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = Ids->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - auto &dev_ctx = ctx.cuda_device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUScatterNdAdd(dev_ctx, *Updates, *Ids, Out); - } else { - phi::funcs::GPUScatterNdAdd(dev_ctx, *Updates, *Ids, Out); - } - } -}; - -template -class ScatterNdAddGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Index"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - } - if (dUpdates) { - 
dUpdates->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.cuda_device_context(); - // Gradient by Gather - const auto &index_type = Ids->dtype(); - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); - } else { - phi::funcs::GPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CUDA = paddle::platform::CUDADeviceContext; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(scatter_nd_add, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel, - ops::ScatterNdAddOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(scatter_nd_add_grad, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel, - ops::ScatterNdAddGradOpCUDAKernel); diff --git a/paddle/fluid/operators/scatter_nd_add_op.h b/paddle/fluid/operators/scatter_nd_add_op.h deleted file mode 100644 index 81c95fe55abaad2e126a52ac7ab97dea24fe67f0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scatter_nd_add_op.h +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/gather.h" -#include "paddle/phi/kernels/funcs/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class ScatterNdAddOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Index"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - - // In place output: Out = X - framework::TensorCopySync(*X, ctx.GetPlace(), Out); - const auto &index_type = Ids->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s], but " - "desires to be [%s] or [%s].", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - - auto &dev_ctx = ctx.template device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::ScatterNdAdd(dev_ctx, *Updates, *Ids, Out); - } else { - phi::funcs::ScatterNdAdd(dev_ctx, *Updates, *Ids, Out); - } - } -}; - -template -class ScatterNdAddGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Index"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - } - if 
(dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Ids] - const auto &index_type = Ids->dtype(); - auto &dev_ctx = ctx.template device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::CPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); - } else { - phi::funcs::CPUGatherNd(dev_ctx, *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 3174f07e96e227c8a2f1103d3d6664673c7a2d56..5f6b04cf59e0e3c8c05d44ad6c4a3321ff2516e4 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/scatter_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,46 +26,6 @@ class ScatterOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of ScatterOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Ids"), true, - platform::errors::InvalidArgument( - "Input(Ids) of ScatterOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Updates"), true, - platform::errors::InvalidArgument( - "Input(Updates) of ScatterOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of ScatterOp should not be null.")); - - auto updates_dims 
= ctx->GetInputDim("Updates"); - auto ref_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Ids").size(), 1, - platform::errors::InvalidArgument( - "The size of Input(Ids)'s shape should be equal to 1, but " - "received the rank of Input(Ids) is %d.", - ctx->GetInputDim("Ids").size())); - PADDLE_ENFORCE_EQ( - ref_dims.size(), updates_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Updates) should have the same shape size, " - "but received the size of Input(x)'s shape is %d, the size of " - "Input(Updates)'s shape is %d.", - ref_dims.size(), updates_dims.size())); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0], - platform::errors::InvalidArgument( - "Input(Updates) and Input(Ids) should have same batch-size, but" - " received Input(Updates)'s batch-size is %d, Input(Ids)'s " - "batch-size is %d.", - ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0])); - ctx->SetOutputDim("Out", ref_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -76,17 +39,6 @@ class ScatterGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - if (ctx->HasOutput(framework::GradVarName("Updates"))) { - ctx->SetOutputDim(framework::GradVarName("Updates"), - ctx->GetInputDim("Updates")); - } - if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -151,17 +103,17 @@ DECLARE_INPLACE_OP_INFERER(ScatterInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(scatter, ScatterInferShapeFunctor, + 
PD_INFER_META(phi::ScatterInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(scatter_grad, ScatterGradInferShapeFunctor, + PD_INFER_META(phi::ScatterGradInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker, ops::ScatterGradMaker, ops::ScatterGradMaker, - ops::ScatterInplaceInferer); + ops::ScatterInplaceInferer, ScatterInferShapeFunctor); REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp, - ops::ScatterGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel, - ops::ScatterOpKernel, ops::ScatterOpKernel, - ops::ScatterOpKernel); -REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel, - ops::ScatterGradientOpKernel, - ops::ScatterGradientOpKernel, - ops::ScatterGradientOpKernel); + ops::ScatterGradNoNeedBufferVarsInferer, + ScatterGradInferShapeFunctor); diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu deleted file mode 100644 index 7755e376bc1956a1f9e09dc2eb8aead9fa083157..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scatter_op.cu +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/scatter_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" - -namespace paddle { -namespace operators { - -template -class ScatterOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Ids"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - bool overwrite = ctx.Attr("overwrite"); - - framework::TensorCopy(*X, ctx.GetPlace(), Out); - // use template class to support int32_t and int64_t - auto index_type = Ids->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "scatter_op Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - auto &dev_ctx = ctx.cuda_device_context(); - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUScatterAssign(dev_ctx, *Updates, *Ids, Out, - overwrite); - } else { - phi::funcs::GPUScatterAssign(dev_ctx, *Updates, *Ids, Out, - overwrite); - } - } -}; - -template -class ScatterGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Ids"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - - auto index_type 
= Ids->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "scatter_op index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - - auto &dev_ctx = ctx.cuda_device_context(); - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUScatterGradForX(dev_ctx, *Ids, dX); - } else { - phi::funcs::GPUScatterGradForX(dev_ctx, *Ids, dX); - } - } - - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == phi::DataType::INT32) { - phi::funcs::GPUGather(dev_ctx, *dOut, *Ids, dUpdates); - } else { - phi::funcs::GPUGather(dev_ctx, *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - scatter_grad, ops::ScatterGradOpCUDAKernel, - ops::ScatterGradOpCUDAKernel, ops::ScatterOpCUDAKernel, - ops::ScatterOpCUDAKernel, - ops::ScatterGradOpCUDAKernel); diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h deleted file mode 100644 index 7733181a93fb60c116ff3da964336b0a85d9a84c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scatter_op.h +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/gather.h" -#include "paddle/phi/kernels/funcs/scatter.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class ScatterOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *X = ctx.Input("X"); - auto *Ids = ctx.Input("Ids"); - auto *Updates = ctx.Input("Updates"); - auto *Out = ctx.Output("Out"); - double overwrite = ctx.Attr("overwrite"); - - // In place output: Out = X, Out[Ids] = Updates - framework::TensorCopy(*X, ctx.GetPlace(), Out); - // Apply ScatterUpdate: Out[index] = Updates[:] - const auto &index_type = Ids->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - auto &dev_ctx = ctx.template device_context(); - if (overwrite) { - if (index_type == phi::DataType::INT32) { - phi::funcs::ScatterAssign(dev_ctx, *Updates, *Ids, Out); - } else { - phi::funcs::ScatterAssign(dev_ctx, *Updates, *Ids, Out); - } - } else { - if (index_type == phi::DataType::INT32) { - 
phi::funcs::ScatterAssignAdd(dev_ctx, *Updates, *Ids, Out); - } else { - phi::funcs::ScatterAssignAdd(dev_ctx, *Updates, *Ids, Out); - } - } - } -}; - -template -class ScatterGradientOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - auto *dX = ctx.Output(framework::GradVarName("X")); - auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); - auto *Ids = ctx.Input("Ids"); - auto *dOut = ctx.Input(framework::GradVarName("Out")); - - const auto &index_type = Ids->dtype(); - bool index_type_match = index_type == phi::DataType::INT32 || - index_type == phi::DataType::INT64; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "scatter_op index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s]", - index_type, phi::DataType::INT32, phi::DataType::INT64)); - - auto &dev_ctx = ctx.template device_context(); - if (dX) { - framework::TensorCopy(*dOut, ctx.GetPlace(), dX); - if (index_type == phi::DataType::INT32) { - phi::funcs::CPUScatterGradForX(dev_ctx, *Ids, dX); - } else { - phi::funcs::CPUScatterGradForX(dev_ctx, *Ids, dX); - } - } - - if (dUpdates) { - dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates = dO[Ids] - if (index_type == phi::DataType::INT32) { - phi::funcs::CPUGather(dev_ctx, *dOut, *Ids, dUpdates); - } else { - phi::funcs::CPUGather(dev_ctx, *dOut, *Ids, dUpdates); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index fa5f03a092882ec1f63e9556bc38d94ed40c9a7f..d5ef95269b48a1a7e7b9c3e75af4f9b595580ad3 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -16,8 +16,7 @@ limitations under the 
License. */ #include #include -#include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/operators/scatter_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/scatter_op_xpu.cc b/paddle/fluid/operators/scatter_op_xpu.cc index 9f0b74e8a3f80c5c8a22c2db109f75e6ee316be1..07dd2f2d85fe9ac330be1f85d283c85207b1b78c 100644 --- a/paddle/fluid/operators/scatter_op_xpu.cc +++ b/paddle/fluid/operators/scatter_op_xpu.cc @@ -16,7 +16,10 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/scatter_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc index bbd5b9c4e7db914d63c9c803c52d44f9350c1d41..d0290795455db1546afbda80e71e79de3f1020ac 100644 --- a/paddle/fluid/operators/searchsorted_op.cc +++ b/paddle/fluid/operators/searchsorted_op.cc @@ -12,8 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/searchsorted_op.h" - +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -117,10 +116,3 @@ class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(searchsorted, ops::SearchSortedOp, ops::SearchSortedOpMaker); - -REGISTER_OP_CPU_KERNEL( - searchsorted, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel, - ops::SearchSortedKernel); diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc index 322cd97f01c3ad97ba74f049696fdec592ee524e..9d4c8532a82c064b1b7aef759934ad8dad894ec5 100644 --- a/paddle/fluid/operators/segment_pool_op.cc +++ b/paddle/fluid/operators/segment_pool_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/segment_pool_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -23,22 +26,6 @@ class SegmentPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPool"); - OP_INOUT_CHECK(ctx->HasInput("SegmentIds"), "Input", "SegmentIds", - "SegmentPool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SegmentPool"); - auto dims = ctx->GetInputDim("X"); - dims[0] = -1; - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pooltype") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("SummedIds"), "Output", "SummedIds", - "SegmentPool"); - ctx->SetOutputDim("SummedIds", {-1, 1}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -150,17 +137,11 @@ class SegmentPoolGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(segment_pool, SegmentPoolInferShapeFunctor, + PD_INFER_META(phi::SegmentPoolInferMeta)); + REGISTER_OPERATOR(segment_pool, ops::SegmentPoolOp, ops::SegmentPoolOpMaker, ops::SegmentPoolGradOpMaker, - ops::SegmentPoolGradOpMaker); + ops::SegmentPoolGradOpMaker, + SegmentPoolInferShapeFunctor); REGISTER_OPERATOR(segment_pool_grad, ops::SegmentPoolGradOp); - -REGISTER_OP_CPU_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); - -REGISTER_OP_CPU_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu deleted file mode 
100644 index e147e62a98354087121ca1443b20d9163ef00f73..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/segment_pool_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/segment_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); -REGISTER_OP_CUDA_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h deleted file mode 100644 index 2f5ef7f54f988884a25feba4665283d3ce260988..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/segment_pool_op.h +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/segment_pooling.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { - auto* input = context.Input("X"); - auto* segment = context.Input("SegmentIds"); - auto* output = context.Output("Out"); - std::string pooltype = context.Attr("pooltype"); - Tensor* summed_ids = nullptr; - - int64_t num_indices = segment->numel(); - PADDLE_ENFORCE_EQ( - num_indices, input->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be the same size as dimension 0 of input X.")); - PADDLE_ENFORCE_EQ(num_indices, segment->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be 1-D tensor, or it's other " - "dimension size is 1. 
Segment_ids's shape is: [%s].", - segment->dims())); - - if (input->numel() == 0 || segment->numel() == 0) { - return; - } - - bool cpu_place = context.GetPlace().GetType() == phi::AllocationType::CPU; - if (cpu_place) { - auto dims = input->dims(); - auto* segment_ids = segment->data(); - dims[0] = static_cast(segment_ids[segment->numel() - 1] + 1); - PADDLE_ENFORCE_GT( - dims[0], 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", dims[0])); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, output, static_cast(0)); - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (!cpu_place) { - Tensor length; - length.mutable_data(phi::make_ddim({1}), platform::CPUPlace()); - IndexT* length_data = length.data(); - const IndexT* segment_ids = segment->data(); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - hipMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - cudaMemcpyDeviceToHost)); -#endif - - IndexT length_host = length_data[0]; - length_host++; - PADDLE_ENFORCE_GT( - length_host, 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", length_data[0])); - auto dims = input->dims(); - dims[0] = static_cast(length_host); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - T init_value = 0; - if (pooltype == "MAX") { - init_value = static_cast(-FLT_MAX); - } else if (pooltype == "MIN") { - init_value = static_cast(FLT_MAX); - } - phi::funcs::SetConstant setconst; - auto& dev_ctx = context.template device_context(); - setconst(dev_ctx, output, static_cast(init_value)); - // the gpu kernel of mean pool record the counts of segment_ids - if (pooltype == "MEAN") { - summed_ids = 
context.Output("SummedIds"); - summed_ids->Resize({dims[0], 1}); - summed_ids->mutable_data(context.GetPlace()); - setconst(dev_ctx, summed_ids, static_cast(1e-12)); - } - } -#endif - - SegmentPoolFunctor pool; - - pool(context.template device_context(), *input, *segment, - output, summed_ids, pooltype); -} - -template -class SegmentPoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* segment = context.Input("SegmentIds"); - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentKernelLaunchHelper(context); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentKernelLaunchHelper(context); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -template -class SegmentPoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Input("Out"); - auto* segment = context.Input("SegmentIds"); - auto* out_g = context.Input(framework::GradVarName("Out")); - auto* in_g = context.Output(framework::GradVarName("X")); - std::string pooltype = context.Attr("pooltype"); - - const Tensor* summed_ids = nullptr; - if (pooltype == "MEAN") { - summed_ids = context.Input("SummedIds"); - } - - in_g->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, in_g, static_cast(0)); - - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else if (index_type == framework::proto::VarType::INT64) { 
- SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/selu_op.cc b/paddle/fluid/operators/selu_op.cc index 88ef1f3ea4aa4d8d827a810026575c20e596b4e7..59c6e16535738ba6cbb3224dd4ff5c2987618cdf 100644 --- a/paddle/fluid/operators/selu_op.cc +++ b/paddle/fluid/operators/selu_op.cc @@ -16,7 +16,10 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,10 +31,6 @@ class SeluOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - return UnaryOpUnchangedInferShape(ctx); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -121,7 +120,12 @@ class SeluGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(selu, SeluInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(selu, ops::SeluOp, ops::SeluOpMaker, ops::SeluOpInferVarType, ops::SeluGradMaker, - ops::SeluGradMaker); + ops::SeluGradMaker, + SeluInferShapeFunctor); + REGISTER_OPERATOR(selu_grad, ops::SeluGradOp); diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc index 
6c33ff52044b26b598f835ee40462a01077c1ff8..23c6a0133e1edafba5621825db78a52b88e6947a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc @@ -184,9 +184,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { col_data, paddle::platform::errors::Fatal("XPU memory is not enough")); if (in_g || filter_g) { - int r = xpu::constant(xpu_context, col_data, col_numel, T(0)); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); - bool trans_a = false; bool trans_b = true; int m = out_g->dims()[0]; @@ -208,7 +205,7 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { const T* data_b = filter->data(); T* data_c = col_data; - r = xpu::fc_fusion( + int r = xpu::fc_fusion( xpu_context, data_a, data_b, data_c, m, n, k, trans_a, trans_b, nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr, xpu::Activation_t::LINEAR); @@ -222,7 +219,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); - xpu::constant(xpu_context, in_g->data(), in_g->numel(), T(0)); int r = xpu::sequence_context_projection_grad( xpu_context, in_g->data(), col_data, nullptr, lodx, sequence_width, @@ -232,8 +228,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { if (filter_g) { filter_g->mutable_data(context.GetPlace()); - xpu::constant(xpu_context, filter_g->data(), filter_g->numel(), - T(0)); int r = xpu::sequence_context_projection( xpu_context, in->data(), col_data, nullptr, lodx, sequence_width, diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index ec3e04e71faf0b20950d87de1a7f066e2e49310a..513ab46e9b5eebdb39faf4401d9d8b2fc387a82f 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -241,23 +241,8 @@ REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, ops::SetValueGradMaker, 
ops::SetValueOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); -REGISTER_OP_CPU_KERNEL( - set_value_grad, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel); - REGISTER_OP_VERSION(set_value) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu deleted file mode 100644 index f9701b0acaac769bd91bbba156a010c2e05e42c3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/set_value_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/set_value_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - -REGISTER_OP_CUDA_KERNEL( - set_value_grad, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel, - ops::SetValueGradKernel); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 9dd727959202c6b09bad0f07aa242a8897583342..4696907f32e6d323c31a27cc6959e26f20168503 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -19,14 +19,10 @@ #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/assign_value_op.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/slice_utils.h" -#include "paddle/fluid/operators/strided_slice_op.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/enforce.h" @@ -36,23 +32,6 @@ namespace operators { using Tensor = framework::Tensor; using DDim = framework::DDim; -inline void GetOffsets(const DDim& big_dim, const DDim& small_dim, - DDim start_offset, int cur_dim, - std::vector* offsets) { - if (cur_dim == big_dim.size()) { - offsets->push_back(start_offset); - return; - } - if (small_dim[cur_dim] == big_dim[cur_dim]) { - GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); - } else { - for (int i = 0; i < big_dim[cur_dim]; i++) { - GetOffsets(big_dim, small_dim, start_offset, cur_dim + 1, offsets); - start_offset[cur_dim] += 1; - } - } -} - inline std::string GetValueName(framework::proto::VarType::Type data_type) { std::string value_name; 
switch (data_type) { @@ -122,447 +101,5 @@ inline void CheckIsDimsMatch(const framework::DDim first, second.to_str(), first.to_str())); } -template -class SetValueKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const int rank = ctx.Input("Input")->dims().size(); - - // TODO(liym27): A more elegent code to do this. C++ has to make template - // integer as constant, but we had better have alternative writing in the - // future. - switch (rank) { - case 1: - SetValueCompute<1>(ctx); - break; - case 2: - SetValueCompute<2>(ctx); - break; - case 3: - SetValueCompute<3>(ctx); - break; - case 4: - SetValueCompute<4>(ctx); - break; - case 5: - SetValueCompute<5>(ctx); - break; - case 6: - SetValueCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input should be less than 7, but received %d.", rank)); - } - } - - private: - template - void SetValueCompute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("Input"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto* out = ctx.Output("Out"); - - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); - - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); - auto steps = ctx.Attr>("steps"); - auto shape = ctx.Attr>("shape"); - auto decrease_axes = ctx.Attr>("decrease_axes"); - auto none_axes = ctx.Attr>("none_axes"); - - if (!starts_tensor_list.empty()) { - starts = GetDataFromTensorList(starts_tensor_list); - } - if (!ends_tensor_list.empty()) { - ends = GetDataFromTensorList(ends_tensor_list); - } - if (!steps_tensor_list.empty()) { - steps = GetDataFromTensorList(steps_tensor_list); - } - - auto in_dims = in->dims(); - CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = 
GetSliceDims(in_dims, axes, starts, ends, &steps); - auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); - - auto slice_dims_for_assign = decrease_slice_dims; - if (!none_axes.empty()) { - std::vector slice_dims_with_none; - - size_t none_axes_cur = 0, decrease_axes_cur = 0; - for (int i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= i) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - if (decrease_axes_cur < decrease_axes.size() && - decrease_axes[decrease_axes_cur] == i) { - decrease_axes_cur++; - } else { - slice_dims_with_none.push_back(slice_dims[i]); - } - } - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - - slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); - } - - auto place = ctx.GetPlace(); - auto& eigen_place = - *ctx.template device_context().eigen_device(); - - // Here copy data from input to avoid data loss at PE and Graph level. - // TODO(liym27): Speed up in the future version. - // - Q: Why don't call ShareDataWith to speed up? - // - A: Because it's not supported to ShareDataWith on OP's input and output - // https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-and-ShareBufferWith-are-prohibited-in-OP - // - Q: Why don't delete Input, after all, the input and output are the same - // Tensor at program level? - // - A: If deleting Input, the graph will be complex, such as there will - // be two ops points to the output in graph: op1 -> output <- set_value. - // In this case, we have to find a way to handle the running order of - // set_value is what we want. 
- paddle::framework::TensorCopy(*in, place, out); - - Tensor slice_tensor(in->dtype()), pad_tensor(in->dtype()); - slice_tensor.mutable_data(slice_dims, place); - pad_tensor.mutable_data(in_dims, place); - - auto pad_e = framework::EigenTensor::From(pad_tensor, in_dims); - auto out_e = framework::EigenTensor::From(*out); - auto slice_e = framework::EigenTensor::From(slice_tensor, slice_dims); - - // Step 1: Set the value of out at `_index` to zero - slice_e.device(eigen_place) = slice_e.constant(T(0)); - - auto starts_indices = Eigen::DSizes(); - auto ends_indices = Eigen::DSizes(); - auto strides_indices = Eigen::DSizes(); - - for (size_t i = 0; i < D; ++i) { - starts_indices[i] = 0; - ends_indices[i] = slice_dims[i]; - strides_indices[i] = 1; - } - for (size_t i = 0; i < axes.size(); i++) { - int axis_index = axes[i]; - starts_indices[axis_index] = starts[i]; - ends_indices[axis_index] = ends[i]; - strides_indices[axis_index] = steps[i]; - if (starts[i] == ends[i]) { // slice is empty, data will not be changed - return; - } - } - - out_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 2: Set a tensor with the same shape as out tensor. And its data at - // '_index' is the same as value_tensor, and data out of '_index' to zero - - // - Step 2.1 Set slice tensor with value - - // NOTE(liym27): [ Why resize slice_tensor here? ] - // A: When do broadcasting on slice_tensor and value_tensor, the shape of - // slice_tensor should be decreased dims. - // e.g. - // x[:,0] = value_tensor - // x's shape = [3, 4], value_tensor's shape = [3] - // We get slice_dims = [3, 1], decrease_slice_dims = [3] - // If do broadcasting on Tensor with shape [3, 1] and [3], the result's - // shape is [3, 3], which cross the border; - // If do broadcasting on Tensor with shape [3] and [3], the result's shape - // is [3], which is right. 
- - slice_tensor.Resize(slice_dims_for_assign); - if (value_tensor != nullptr) { - CheckIsDimsMatch(slice_dims_for_assign, value_tensor->dims()); - // ElementwiseComputeEx can do broadcasting - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); - } else { - Tensor value_t(in->dtype()); - auto value_dims = phi::make_ddim(shape); - CheckIsDimsMatch(slice_dims_for_assign, value_dims); - - value_t.mutable_data(value_dims, place); - auto value_name = - GetValueName(framework::TransToProtoVarType(in->dtype())); - CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); - value_t.Resize(value_dims); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, &value_t, -1, SubFunctor(), &slice_tensor); - } - slice_tensor.Resize(slice_dims); - - // - Step 2.2 Pad slice tensor with 0 - pad_e.device(eigen_place) = pad_e.constant(T(0)); - pad_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 3: Set out tensor with value_tensor - out_e.device(eigen_place) = out_e - pad_e; - } -}; - -template -class SetValueGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - int rank = ctx.Input(framework::GradVarName("Out"))->dims().size(); - - switch (rank) { - case 1: - SetValueGradCompute<1>(ctx); - break; - case 2: - SetValueGradCompute<2>(ctx); - break; - case 3: - SetValueGradCompute<3>(ctx); - break; - case 4: - SetValueGradCompute<4>(ctx); - break; - case 5: - SetValueGradCompute<5>(ctx); - break; - case 6: - SetValueGradCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of set_value_grad's input should be less than 7, but " - "received %d.", - rank)); - } - } - - private: - template - void SetValueGradCompute(const framework::ExecutionContext& context) const { - auto starts = context.Attr>("starts"); - auto ends = context.Attr>("ends"); - auto 
steps = context.Attr>("steps"); - - auto axes_int64 = context.Attr>("axes"); - std::vector axes(axes_int64.begin(), axes_int64.end()); - - auto starts_indices = Eigen::DSizes(); - auto ends_indices = Eigen::DSizes(); - auto steps_indices = Eigen::DSizes(); - auto reverse_axis = Eigen::array(); - - auto list_new_ends_tensor = - context.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - context.MultiInput("StartsTensorList"); - auto list_new_steps_tensor = - context.MultiInput("StepsTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } - - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } - - if (list_new_steps_tensor.size() > 0) { - steps = GetDataFromTensorList(list_new_steps_tensor); - } - - auto in = context.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE_EQ( - in->IsInitialized(), true, - platform::errors::PermissionDenied( - "The input of `set_value_grad`(%s) has not been initialized", - framework::GradVarName("Out"))); - auto grad_value = context.Output( - framework::GradVarName("ValueTensor")); - auto grad_input = - context.Output(framework::GradVarName("Input")); - auto in_dims = in->dims(); - - auto decrease_axis_int64 = - context.Attr>("decrease_axes"); - std::vector decrease_axis(decrease_axis_int64.begin(), - decrease_axis_int64.end()); - std::vector infer_flags(axes.size(), 1); - std::vector out_dims_vector(in_dims.size(), -1); - StridedSliceOutDims(starts, ends, steps, axes, infer_flags, in_dims, - decrease_axis, out_dims_vector.data(), axes.size(), - false); - - framework::DDim out_dims(phi::make_ddim(out_dims_vector)); - - std::vector reverse_vector(starts.size(), 0); - StridedSliceFunctor(starts.data(), ends.data(), steps.data(), axes.data(), - reverse_vector.data(), in_dims, infer_flags, - decrease_axis, starts.size()); - - for (size_t axis = 0; axis < D; axis++) { - starts_indices[axis] = 0; - ends_indices[axis] = 
out_dims[axis]; - steps_indices[axis] = 1; - reverse_axis[axis] = false; - } - - for (size_t axis = 0; axis < axes.size(); axis++) { - int axis_index = axes[axis]; - starts_indices[axis_index] = starts[axis]; - ends_indices[axis_index] = ends[axis]; - steps_indices[axis_index] = steps[axis]; - reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? true : false; - } - - bool need_reverse = false; - for (size_t axis = 0; axis < axes.size(); axis++) { - if (reverse_vector[axis] == 1) { - need_reverse = true; - break; - } - } - - auto& dev_ctx = context.template device_context(); - auto& place = - *context.template device_context().eigen_device(); - phi::funcs::SetConstant set_zero; - - if (grad_input) { - // Set gradient of `Input` - paddle::framework::TensorCopy(*in, context.GetPlace(), grad_input); - - auto grad_input_t = - framework::EigenTensor::From(*grad_input); - - framework::Tensor tmp(grad_input->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - grad_input_t.stridedSlice(starts_indices, ends_indices, steps_indices) - .device(place) = tmp_t; - } - if (grad_value) { - grad_value->mutable_data(context.GetPlace()); - set_zero(dev_ctx, grad_value, static_cast(0)); - - auto in_t = framework::EigenTensor::From(*in); - - if (grad_value->dims() == out_dims) { - auto grad_value_t = - framework::EigenTensor::From(*grad_value); - if (need_reverse) { - framework::Tensor tmp(grad_value->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - tmp_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - grad_value_t.device(place) = tmp_t.reverse(reverse_axis); - } else { - grad_value_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - } - } else { - int out_dims_size = out_dims.size(); - auto 
grad_value_dims = grad_value->dims(); - auto fake_grad_value_dims = out_dims; - - // Create an extented shape according to the rules of broadcast. - auto grad_value_dims_size = grad_value_dims.size(); - - int num_decrease = 0; - - int decrease_axis_size = decrease_axis.size(); - for (int i = 0; i < out_dims_size; i++) { - if (decrease_axis.end() != - std::find(decrease_axis.begin(), decrease_axis.end(), i)) { - fake_grad_value_dims[i] = 1; - num_decrease++; - } else if (i < out_dims_size - (grad_value_dims_size + - decrease_axis_size - num_decrease)) { - fake_grad_value_dims[i] = 1; - } else { - auto index_grad = - i - (out_dims_size - (grad_value_dims_size + - decrease_axis_size - num_decrease)); - fake_grad_value_dims[i] = grad_value_dims[index_grad]; - - PADDLE_ENFORCE_EQ((out_dims[i] == grad_value_dims[index_grad]) || - (grad_value_dims[index_grad] == 1), - true, - platform::errors::InvalidArgument( - "An error occurred while calculating %s: " - "[%s] can not be accumulated into [%s].", - framework::GradVarName("ValueTensor"), - out_dims, grad_value_dims)); - } - } - - VLOG(3) << "Dimensions of " << framework::GradVarName("ValueTensor") - << "([" << grad_value_dims << "])is broadcasted into [" - << fake_grad_value_dims << "]."; - - auto extent = Eigen::DSizes(); - auto offset = out_dims; - for (int i = 0; i < out_dims_size; i++) { - offset[i] = 0; - extent[i] = fake_grad_value_dims[i]; - } - std::vector offsets; - GetOffsets(out_dims, fake_grad_value_dims, offset, 0, &offsets); - - auto grad_value_t = - framework::EigenTensor:: - From(*grad_value, fake_grad_value_dims); - - framework::Tensor tmp(grad_value->dtype()); - tmp.mutable_data(out_dims, context.GetPlace()); - set_zero(dev_ctx, &tmp, static_cast(0)); - auto tmp_t = framework::EigenTensor::From(tmp); - - tmp_t.device(place) = - in_t.stridedSlice(starts_indices, ends_indices, steps_indices); - - // accumulate gradient - for (auto offset : offsets) { - grad_value_t.device(place) = - grad_value_t + - 
tmp_t.slice(framework::EigenDim::From(offset), extent); - } - if (need_reverse) { - framework::Tensor tmp_value(grad_value->dtype()); - tmp_value.mutable_data(fake_grad_value_dims, context.GetPlace()); - auto tmp_value_t = - framework::EigenTensor::From(tmp_value); - tmp_value_t.device(place) = grad_value_t.reverse(reverse_axis); - grad_value_t.device(place) = tmp_value_t; - } - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 599697059c4dcfa54fa728a8ebf88ad95f387774..46d64333b608b7f3e7b3d83664978d162b6d6e52 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -174,6 +174,9 @@ class SetValueNPUKernel : public framework::OpKernel { .AddInput(std::move(index_indices)) .AddInput(val_temp) .AddOutput(out_temp) +#if (CANN_VERSION_CODE >= 504001) + .AddAttrs({{"use_locking", false}}) +#endif .Run(stream); } }; diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 5b7ccdde81097a2cfd74c3d65c0679d277b766a3..e2c8359beb1290f7b1b592c1ff24b15986f41f73 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/shape_op.h" #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -95,9 +93,3 @@ REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, - ops::ShapeKernel>, - ops::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu deleted file mode 100644 index c6e380a94f84db7de53d0c218682813fcad0128d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shape_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/shape_op.h" -#include "paddle/fluid/platform/complex.h" - -REGISTER_OP_CUDA_KERNEL( - shape, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel, - paddle::operators::ShapeKernel>, - paddle::operators::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h deleted file mode 100644 index 39ebcca46a710e0b817792105046af70b6298fc1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shape_op.h +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = phi::SelectedRows; - -template -class ShapeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_var = ctx.InputVar("Input"); - framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); - } else { - in_dims = in_var->Get().dims(); - } - auto* out_t = ctx.Output("Out"); - out_t->Resize({in_dims.size()}); - auto out_data = out_t->mutable_data(platform::CPUPlace()); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc index 7bff7b2d668347692309d3695eb46b1fbdb6c7dd..f751ab41014c21fda2403bd69bcd20ad549e40c7 100644 --- a/paddle/fluid/operators/shape_op_npu.cc +++ b/paddle/fluid/operators/shape_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/shape_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc index 2e9092a643253843ed09ab7475ec3ed723d5e3b8..a62d1b434e76434c3710e45e723060d3f452c91c 100644 --- a/paddle/fluid/operators/shape_op_xpu.cc +++ b/paddle/fluid/operators/shape_op_xpu.cc @@ -10,12 +10,41 @@ * limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU +#include +#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; + +template +class ShapeXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } + } +}; +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel); +REGISTER_OP_XPU_KERNEL(shape, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel, + ops::ShapeXPUKernel, ops::ShapeXPUKernel); #endif diff --git a/paddle/fluid/operators/shard_index_op.cc b/paddle/fluid/operators/shard_index_op.cc index 54555e494ffe5f2c226c7aabd47b4ce991dab2ec..053a90f2fc9fa2f93c2647c420a046401198bc28 100644 --- a/paddle/fluid/operators/shard_index_op.cc +++ b/paddle/fluid/operators/shard_index_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/shard_index_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -20,27 +23,6 @@ namespace operators { class ShardIndexOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ShardIndex"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ShardIndex"); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE(x_dims.size(), 2, - platform::errors::InvalidArgument( - "Rank of Input(X) should be at least 2, " - "but the value given is %d.", - x_dims.size())); - if (ctx->IsRuntime() || x_dims[x_dims.size() - 1] > 0) { - PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], 1U, - platform::errors::InvalidArgument( - "The last dimension of Input(X) should be 1, " - "but the value given is %d.", - x_dims[x_dims.size() - 1])); - } - - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", /* --> */ "Out"); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -114,7 +96,10 @@ Examples: } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(shard_index, ops::ShardIndexOp, - ops::ShardIndexOpMaker); -REGISTER_OP_CPU_KERNEL(shard_index, ops::ShardIndexCPUKernel, - ops::ShardIndexCPUKernel); +DECLARE_INFER_SHAPE_FUNCTOR(shard_index, ShardIndexInferShapeFunctor, + PD_INFER_META(phi::ShardIndexInferMeta)); +REGISTER_OPERATOR( + shard_index, ops::ShardIndexOp, ops::ShardIndexOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ShardIndexInferShapeFunctor); diff --git a/paddle/fluid/operators/shard_index_op.cu b/paddle/fluid/operators/shard_index_op.cu deleted file mode 100644 index 
115b3f47d664ba00228343d221d5be70d13a7ff1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shard_index_op.cu +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/shard_index_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void ShardIndexInner(const T* in_data, T* out_data, - const int64_t numel, const int index_num, - const int nshards, const int shard_id, - const int ignore_value) { - int shard_size = (index_num + nshards - 1) / nshards; - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel) { - assert(in_data[idx] >= 0 && in_data[idx] < index_num); - if (in_data[idx] / shard_size == shard_id) { - out_data[idx] = in_data[idx] % shard_size; - } else { - out_data[idx] = ignore_value; - } - } -} - -using LoDTensor = framework::LoDTensor; - -template -class ShardIndexCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = 
context.Attr("ignore_value"); - PADDLE_ENFORCE_GT( - index_num, 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, shard_id)); - - out->Resize(in->dims()); - out->set_lod(in->lod()); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - ShardIndexInner<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, numel, index_num, nshards, shard_id, ignore_value); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(shard_index, ops::ShardIndexCUDAKernel, - ops::ShardIndexCUDAKernel); diff --git a/paddle/fluid/operators/shard_index_op.h b/paddle/fluid/operators/shard_index_op.h deleted file mode 100644 index c2fe3711686d4c4c802fadd66d4bc994232ef5ec..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shard_index_op.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using LoDTensor = framework::LoDTensor; -template -class ShardIndexCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - PADDLE_ENFORCE_GT( - index_num, 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, shard_id)); - - int shard_size = (index_num + nshards - 1) / nshards; - - out->Resize(in->dims()); - out->set_lod(in->lod()); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - 
for (int64_t i = 0; i < numel; ++i) { - PADDLE_ENFORCE_GE(in_data[i], 0, - platform::errors::InvalidArgument( - "The input_index for Op(shard_index) must be " - "greater or equal to 0, but the value given is %d.", - in_data[i])); - PADDLE_ENFORCE_LT(in_data[i], index_num, - platform::errors::InvalidArgument( - "The input_index for Op(shard_index) must be less " - "than index_num (%d), but the value given is %d.", - index_num, in_data[i])); - if (in_data[i] / shard_size == shard_id) { - out_data[i] = in_data[i] % shard_size; - } else { - out_data[i] = ignore_value; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc index dc2e8ad58f31ce8fe845ecb1f368544704e1d9ad..c875448424a24e686b9a6285725f801d604abc46 100644 --- a/paddle/fluid/operators/shard_index_op_npu.cc +++ b/paddle/fluid/operators/shard_index_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/shard_index_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index a4e80343903d5a48dda584dc1f203782adb36787..016ff54645b02e9b3ddfb67595d830ccf5dcfd94 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -12,59 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { using framework::Tensor; +const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", - "SigmoidCrossEntropyWithLogitsOp"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, labels_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." - "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, labels_dims.size())); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank), - phi::slice_ddim(labels_dims, 0, rank), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same shape " - "except the last dimension. 
But received: the shape of " - "Input(X) is [%s], the shape of Input(Label) is [%s].", - x_dims, labels_dims)); - } - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class SigmoidCrossEntropyWithLogitsGradOp @@ -200,23 +164,17 @@ DECLARE_INPLACE_OP_INFERER(SigmoidCrossEntropyWithLogitsGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR( + sigmoid_cross_entropy_with_logits, + SigmoidCrossEntropyWithLogitsInferShapeFunctor, + PD_INFER_META(phi::SigmoidCrossEntropyWithLogitsInferMeta)); REGISTER_OPERATOR( sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp, ops::SigmoidCrossEntropyWithLogitsOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, - ops::SigmoidCrossEntropyWithLogitsInplaceInferer); + ops::SigmoidCrossEntropyWithLogitsInplaceInferer, + SigmoidCrossEntropyWithLogitsInferShapeFunctor); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp, ops::SigmoidCrossEntropyWithLogitsGradInplaceInferer); -REGISTER_OP_CPU_KERNEL( - sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel, - ops::SigmoidCrossEntropyWithLogitsKernel); -REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, float>, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu deleted file mode 100644 index 40476d5e11f6a3b0cad21038a3f342d824f3575c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/math.h" -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#ifdef __HIPCC__ -static constexpr int kNumCUDAThreads = 256; -#else -static constexpr int kNumCUDAThreads = 512; -#endif -static constexpr int kNumMaxinumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaxinumNumBlocks); -} - -template -__global__ void GPUSigmoidForward(const T *x_data, const T *label_data, - const int ignore_index, const int limit, - T *out_data, T *counts) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - T label = label_data[i]; - T eps = static_cast(1e-5); - T diff = label - static_cast(ignore_index); - if ((diff > -eps) && (diff < eps)) { - out_data[i] = static_cast(0.); - counts[i] = 0; - } else { - T term1 = (x > 0) ? 
x : 0; - T term2 = x * label; - T term3 = real_log(static_cast(1) + real_exp(static_cast(-abs(x)))); - out_data[i] = term1 - term2 + term3; - counts[i] = 1; - } - } -} - -template -__global__ void Sum(const T *counts, int num, const T eps, T *sum) { - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - T in = 0; - for (int i = threadIdx.x; i < num; i += BlockDim) { - in += counts[i]; - } - __syncthreads(); - auto out = - BlockReduce(temp_storage).Reduce(static_cast(in), cub::Sum()); - __syncthreads(); - if (threadIdx.x == 0) { - T a = out > eps ? out : eps; - sum[0] = a; - } -} - -template -__global__ void Div(T *loss, const int num, const T *norm) { - CUDA_KERNEL_LOOP(i, num) { loss[i] /= norm[0]; } -} - -template -__global__ void GPUSigmoidBackward(const T *x_data, const T *label_data, - const int ignore_index, const T *dout_data, - const int limit, T *dx_data, T *counts) { - CUDA_KERNEL_LOOP(i, limit) { - T x = x_data[i]; - T label = label_data[i]; - T dout = dout_data[i]; - T eps = static_cast(1e-5); - T diff = label - static_cast(ignore_index); - if ((diff > -eps) && (diff < eps)) { - dx_data[i] = static_cast(0.); - counts[i] = 0; - } else { - T simoid_x = static_cast(1) / (static_cast(1) + real_exp(-x)); - T diff = simoid_x - label; - dx_data[i] = dout * diff; - counts[i] = 1; - } - } -} - -// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template -class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - Tensor *Out = context.Output("Out"); - int ignore_index = context.Attr("ignore_index"); - auto out_data = Out->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.cuda_device_context(); - bool normalize = context.Attr("normalize"); - - // Temporary memory - auto cnt_ptr = memory::Alloc(dev_ctx, 
Labels->numel() * sizeof(T)); - T *counts = reinterpret_cast(cnt_ptr->ptr()); - - int limit = Out->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; - GPUSigmoidForward<<>>( - X->data(), Labels->data(), ignore_index, limit, out_data, counts); - if (normalize) { - auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); - T *norm = reinterpret_cast(norm_ptr->ptr()); - Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( - counts, limit, static_cast(1e-5), norm); - Div<<>>(out_data, limit, norm); - } - } -}; - -// dX = sigmoid(X) - labels -template -class GPUSigmoidCrossEntropyWithLogitsGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - - int ignore_index = context.Attr("ignore_index"); - - auto &dev_ctx = context.cuda_device_context(); - // Temporary memory - auto cnt_ptr = memory::Alloc(dev_ctx, X->numel() * sizeof(T)); - T *counts = reinterpret_cast(cnt_ptr->ptr()); - - int limit = dX->numel(); - int blocks = NumBlocks(limit); - int threads = kNumCUDAThreads; - GPUSigmoidBackward<<>>( - X->data(), Labels->data(), ignore_index, dOut->data(), limit, - dx_data, counts); - bool normalize = context.Attr("normalize"); - if (normalize) { - auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); - T *norm = reinterpret_cast(norm_ptr->ptr()); - Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( - counts, limit, static_cast(1e-5), norm); - Div<<>>(dx_data, limit, norm); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, - ops::GPUSigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, 
float>, - ops::GPUSigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CUDADeviceContext, double>); -REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CUDADeviceContext, float>, - ops::GPUSigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CUDADeviceContext, double>); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h deleted file mode 100644 index d2ced490ceff474e1e7624c591a9d142b4199c2f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -const int kIgnoreIndex = -100; - -// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template -class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - Tensor *Out = context.Output("Out"); - int ignore_index = context.Attr("ignore_index"); - auto out_data = Out->mutable_data(context.GetPlace()); - int limit = Out->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - T label = label_data[idx]; - if (static_cast(label) == ignore_index) { - out_data[idx] = static_cast(0.); - } else { - T term1 = (x > 0) ? x : 0; - T term2 = x * label; - T term3 = std::log(static_cast(1) + std::exp(-std::abs(x))); - out_data[idx] = term1 - term2 + term3; - } - } - bool normalize = context.Attr("normalize"); - if (normalize) { - int norm = 0; - T eps = static_cast(1e-6); - for (int idx = 0; idx < limit; ++idx) { - T diff = label_data[idx] - static_cast(ignore_index); - if ((diff < -eps) || (diff > eps)) { - norm += 1; - } - } - eps = static_cast(1e-5); - norm = norm > eps ? 
norm : eps; - std::for_each(out_data, out_data + limit, [norm](T &v) { v = v / norm; }); - } - } -}; - -// dX = sigmoid(X) - labels -template -class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - const Tensor *X = context.Input("X"); - const Tensor *Labels = context.Input("Label"); - const Tensor *dOut = context.Input(framework::GradVarName("Out")); - Tensor *dX = context.Output(framework::GradVarName("X")); - auto dx_data = dX->mutable_data(context.GetPlace()); - - int ignore_index = context.Attr("ignore_index"); - int limit = dX->numel(); - auto x_data = X->data(); - auto label_data = Labels->data(); - auto dout_data = dOut->data(); - for (int idx = 0; idx < limit; ++idx) { - T x = x_data[idx]; - T label = label_data[idx]; - T dout = dout_data[idx]; - if (static_cast(label) == ignore_index) { - dx_data[idx] = static_cast(0.); - } else { - T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); - T diff = simoid_x - label; - dx_data[idx] = dout * diff; - } - } - bool normalize = context.Attr("normalize"); - if (normalize) { - int norm = 0; - T eps = static_cast(1e-6); - for (int idx = 0; idx < limit; ++idx) { - T diff = label_data[idx] - static_cast(ignore_index); - if ((diff < -eps) || (diff > eps)) { - norm += 1; - } - } - eps = static_cast(1e-5); - norm = norm > eps ? 
norm : eps; - std::for_each(dx_data, dx_data + limit, [norm](T &v) { v = v / norm; }); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index 40852425997f0b1a9cfa0c86180f2f2254efceec..f186f95a2b96117fa56fc17f70d4d0884214af87 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -12,13 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { // Add this check is is due to Ascend SigmoidCrossEntropyWithLogits diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc index 6395aa1caa01b9578d55e1155b0d6cd0d2295e36..c37731580d1212cb47c9e7f18aa4a9ba20af19d8 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_xpu.cc @@ -17,13 +17,15 @@ #include #include -#include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class SigmoidCrossEntropyWithLogitsXPUKernel : public framework::OpKernel { using XPUType = 
typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc index e2381c76f7e45a962fcacff079ca67df9610b6f1..ceb42dcf3e592182867a890bdfe73e237913ee53 100644 --- a/paddle/fluid/operators/sign_op.cc +++ b/paddle/fluid/operators/sign_op.cc @@ -60,8 +60,8 @@ class SignGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(sign, SignInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, ops::SignGradMaker, ops::SignGradMaker, diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index e584c1a4cce1e85344c574526098b034723c3059..84b0f403be03893810ef592db9b2c993cc6b9644 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -44,8 +44,8 @@ Return the number of elements in the input. } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, - PT_INFER_META(phi::SizeInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(size, SizeInferShapeFunctor, + PD_INFER_META(phi::SizeInferMeta)); REGISTER_OPERATOR( size, ops::SizeOp, ops::SizeOpMaker, paddle::framework::EmptyGradOpMaker, diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 374992096605bfef0433992193e54306c3a12858..3840b99dd176d5b348533f3e50f7f90fc3250ea1 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" @@ -23,6 +24,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -30,30 +35,6 @@ class SoftmaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of SoftmaxOp is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound("Output(Out) of SoftmaxOp is not found.")); - - auto dim_x = ctx->GetInputDim("X"); - auto rank_x = dim_x.size(); - auto axis = ctx->Attrs().Get("axis"); - PADDLE_ENFORCE_GE(axis, -rank_x, - platform::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X).")); - PADDLE_ENFORCE_LT(axis, rank_x, - platform::errors::InvalidArgument( - "Attr(axis) value should be in range [-R, R-1], " - "R is the rank of Input(X).")); - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -168,23 +149,6 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Out"), true, - platform::errors::InvalidArgument("Input(Out) is not found.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument("Input(Out@GRAD) is not found.")); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Out"), - ctx->GetInputDim(framework::GradVarName("Out")), - platform::errors::InvalidArgument("Input(Out) and its gradients " - "should have a same 
shape.")); - - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -244,9 +208,14 @@ DECLARE_INPLACE_OP_INFERER(SoftmaxInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(softmax, SoftmaxInferShapeFunctor, + PD_INFER_META(phi::SoftmaxInferMeta)); REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker, ops::SoftmaxOpGradMaker, - ops::SoftmaxInplaceInferer); -REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); + ops::SoftmaxInplaceInferer, SoftmaxInferShapeFunctor); +DECLARE_INFER_SHAPE_FUNCTOR(softmax_grad, SoftmaxGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); +REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad, + SoftmaxGradInferShapeFunctor); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index 3bc55fafd81e18d0a986268ff4692129c6515edc..3148b31a8322e2bab39ad7f723ee59a6db64c204 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -22,7 +22,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/spectral_op.h b/paddle/fluid/operators/spectral_op.h index 2bc5124843c38152d2f5d3ffcef5a5ca24534bfd..a60ec5a4df52b8275a17185a63c8a7d27dd8132b 100644 --- a/paddle/fluid/operators/spectral_op.h +++ b/paddle/fluid/operators/spectral_op.h @@ -23,9 +23,9 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/conj_op.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/padding.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/padding.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "thrust/device_vector.h" #endif @@ -389,8 +389,9 @@ class FFTR2CGradKernel : public framework::OpKernel { std::vector pads(rank * 2, 0); pads[axes.back() * 2 + 1] = zero_length; - paddle::operators::math::PaddingFunctor( - rank, ctx, pads, static_cast(0), *dy, &full_dy); + phi::funcs::PaddingFunctor( + rank, ctx.template device_context(), pads, + static_cast(0), *dy, &full_dy); fft_c2c_func(dev_ctx, &full_dy, &complex_dx, axes, normalization, !forward); } diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index 6678320f9ffa61e3e6c51fd806569c2571d63d69..5b8922505cc089d66f0b444fc65ccec8ed051876 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -26,6 +26,52 @@ class SplitOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + platform::errors::InvalidArgument( + "Input(X) of SplitOp 
should not be null.")); + PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, + platform::errors::InvalidArgument( + "Outputs(Out) of SplitOp should not be empty.")); + auto in_dims = ctx->GetInputDim("X"); + auto outs_names = ctx->Outputs("Out"); + size_t axis = static_cast(ctx->Attrs().Get("axis")); + size_t num = static_cast(ctx->Attrs().Get("num")); + std::vector sections = static_cast>( + ctx->Attrs().Get>("sections")); + const size_t outs_number = outs_names.size(); + + if (sections.size() > 0) { + PADDLE_ENFORCE_EQ( + sections.size(), outs_number, + platform::errors::InvalidArgument("tensor split sections size " + "should be equal to output size.")); + } + + if (ctx->HasInput("AxisTensor")) { + auto out_dims = phi::make_ddim(std::vector(in_dims.size(), -1)); + std::vector outs_dims(outs_number, out_dims); + ctx->SetOutputsDim("Out", outs_dims); + for (size_t i = 0; i < outs_number; ++i) { + ctx->ShareLoD("X", "Out", 0, i); + } + return; + } + + bool each_section_is_known = + (sections.size() > 0 && !ctx->HasInputs("SectionsTensorList")); + + auto outs_dims = UpdateOutsDims(ctx->IsRuntime(), each_section_is_known, + in_dims, num, sections, axis, outs_number); + ctx->SetOutputsDim("Out", outs_dims); + if (axis != 0) { + // Only pass LoD when not spliting along the first dim. 
+ for (size_t i = 0; i < outs_number; ++i) { + ctx->ShareLoD("X", "Out", 0, i); + } + } + } + protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -125,10 +171,6 @@ Example: namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(split, SplitInferShapeFunctor, - PT_INFER_META(phi::SplitInferMeta)); - REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker, - ops::SplitGradMaker, - SplitInferShapeFunctor); + ops::SplitGradMaker); diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index bff8061814ae66f243ca9d863cf866821ede4a32..aa944cfcfbb1713aeb27b501083853abb4ffed40 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -16,9 +16,10 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/pooling.h" namespace paddle { namespace operators { @@ -53,14 +54,20 @@ class SppKernel : public framework::OpKernel { out_level.mutable_data(output_shape, context.GetPlace()); // pooling if (pooling_type == "max") { - math::Pool2dFunctor, T> pool_forward; - math::MaxPool max_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::MaxPool, T> + pool_forward; + phi::funcs::MaxPool max_process; pool_forward(context.template device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, max_process); } else if (pooling_type == "avg") { - math::Pool2dFunctor, T> pool_forward; - math::AvgPool avg_process; + phi::funcs::Pool2dFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPool, T> + pool_forward; + phi::funcs::AvgPool avg_process; pool_forward(context.template 
device_context(), *in_x, kernel_size, strides, paddings, true, false, &out_level, avg_process); @@ -95,7 +102,9 @@ class SppGradKernel : public framework::OpKernel { std::string pooling_type = context.template Attr("pooling_type"); auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; + phi::funcs::SetConstant< + typename framework::ConvertToPhiContext::TYPE, T> + zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); auto out_stride = phi::stride(out->dims()); @@ -145,14 +154,18 @@ class SppGradKernel : public framework::OpKernel { outgrad_level.Resize(out_shape); // pooling backward if (pooling_type == "max") { - math::MaxPool2dGradFunctor pool2d_backward; + phi::funcs::MaxPool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, T> + pool2d_backward; pool2d_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, in_x_grad); } else if (pooling_type == "avg") { - math::Pool2dGradFunctor, T> + phi::funcs::Pool2dGradFunctor< + typename framework::ConvertToPhiContext::TYPE, + phi::funcs::AvgPoolGrad, T> pool_backward; - math::AvgPoolGrad avg_process; + phi::funcs::AvgPoolGrad avg_process; pool_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, paddings, true, false, in_x_grad, avg_process); diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h index 58e5440689926497705624a0c64e6cc3d43dbab1..a776a78616b8d6dbac66ccab0d59433b98ae65e4 100644 --- a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -17,7 +17,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 956544c53609eb29326dc5cf295d978d767ac176..d61f5aa3f634cd2aee1e5c2f34f4467b1697e455 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/strided_memcpy.h b/paddle/fluid/operators/strided_memcpy.h index c92d468f3462c92cd0631383996012afb6edb46b..af29aac6b9052877283271abc12f4dc1da6b8a3e 100644 --- a/paddle/fluid/operators/strided_memcpy.h +++ b/paddle/fluid/operators/strided_memcpy.h @@ -109,6 +109,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx, auto& npu_ctx = reinterpret_cast(ctx); memory::Copy(npu_place, dst + i * dst_after, npu_place, src + i * src_after, sizeof(T) * size, npu_ctx.stream()); +#elif defined(PADDLE_WITH_MLU) + auto& mlu_place = place; + auto& mlu_ctx = reinterpret_cast(ctx); + memory::Copy(mlu_place, dst + i * dst_after, mlu_place, + src + i * src_after, sizeof(T) * size, mlu_ctx.stream()); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Paddle is not compiled with GPU.")); diff --git a/paddle/fluid/operators/take_along_axis_op.cc b/paddle/fluid/operators/take_along_axis_op.cc index 664f1031915e4661769d9b2844c5388f0efa91c0..fa8a5e92712ec86a01ca01b7eb644e289c03000a 100644 --- a/paddle/fluid/operators/take_along_axis_op.cc +++ 
b/paddle/fluid/operators/take_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/take_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -139,16 +140,3 @@ REGISTER_OPERATOR(take_along_axis, ops::TakeAlongAxisOp, ops::TakeAlongAxisGradOpMaker); REGISTER_OPERATOR(take_along_axis_grad, ops::TakeAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(take_along_axis, ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.cu b/paddle/fluid/operators/take_along_axis_op.cu deleted file mode 100644 index b6c62d497b379dda568f661b31366914e6870a7c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/take_along_axis_op.cu +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/take_along_axis_op.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class TakeAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. 
- auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(take_along_axis, ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.h b/paddle/fluid/operators/take_along_axis_op.h deleted file mode 100644 index fc781dbddf2ad25de3728e76d231d0164d46c08e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/take_along_axis_op.h +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class TakeAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. 
- auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index e05b4de65214c8cf55d099fccc7c18370b2312b7..0a71875d8931ef80846aa7e0c95ce1beab86fd7c 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -79,6 +79,28 @@ static void RuntimeStaticShapeCheck(std::vector runtime_input_shape, model_input_shape_str, runtime_input_shape_str)); } +static paddle::experimental::DataType TRT2FluidDataType( + nvinfer1::DataType type) { + switch (type) { + case nvinfer1::DataType::kFLOAT: + return paddle::experimental::DataType::FLOAT32; + case nvinfer1::DataType::kINT32: + return paddle::experimental::DataType::INT32; + case nvinfer1::DataType::kHALF: + return paddle::experimental::DataType::FLOAT16; + case nvinfer1::DataType::kINT8: + return paddle::experimental::DataType::INT8; +#if IS_TRT_VERSION_GE(7000) + case nvinfer1::DataType::kBOOL: + return paddle::experimental::DataType::BOOL; +#endif + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "unknown fluid datatype in Fluid op converter")); + return paddle::experimental::DataType::FLOAT32; + } +} + static void 
RuntimeDynamicShapeCheck( const std::string &x, const std::vector &runtime_input_shape, const std::vector &min_input_shape, @@ -520,9 +542,12 @@ class TensorRTEngineOp : public framework::OperatorBase { buffers[bind_index] = static_cast(t.data()); } else if (type == framework::proto::VarType::INT32) { buffers[bind_index] = static_cast(t.data()); + } else if (type == framework::proto::VarType::FP16) { + buffers[bind_index] = static_cast(t.data()); } else { - PADDLE_THROW(platform::errors::Fatal( - "The TRT Engine OP only support float/int32_t/int64_t input.")); + PADDLE_THROW( + platform::errors::Fatal("The TRT Engine OP only support " + "float/int32_t/int64_t/float16 input.")); } } @@ -570,9 +595,10 @@ class TensorRTEngineOp : public framework::OperatorBase { "than the number of bindings, but got binding " "index = %d, number of bindings = %d.", bind_index, num_bindings)); - buffers[bind_index] = - static_cast(fluid_t->mutable_data(dev_place)); - + auto trt_type = engine->engine()->getBindingDataType(bind_index); + // get adr and set type + buffers[bind_index] = static_cast( + fluid_t->mutable_data(dev_place, TRT2FluidDataType(trt_type))); output_index += 1; } diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index a7c7e33f58af6ce8f59a301d1fc5ccdf511b608f..1de1b590a1311b81f16ba05e746402e1fc14c556 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/phi/core/ddim.h" -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(softmax); diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index dc12f8e8892a022c6f55f4fe3a6237a7a01fa290..e179149c5bb77bd642f744be48109a941c66febf 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tile_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -26,66 +30,6 @@ class TileOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Tile"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Tile"); - auto x_dims = ctx->GetInputDim("X"); - auto repeat_times = ctx->Attrs().Get>("repeat_times"); - if (repeat_times.size() == 0) { - repeat_times = std::vector(x_dims.size(), -1); - } - - PADDLE_ENFORCE_LE( - x_dims.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, x_dims.size())); - PADDLE_ENFORCE_LE( - repeat_times.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, repeat_times.size())); 
- PADDLE_ENFORCE_GE( - repeat_times.size(), 1, - platform::errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must be positive integers, but the value received is %d.", - repeat_times.size())); - - auto out_rank = - std::max(static_cast(x_dims.size()), repeat_times.size()); - std::vector out_shape(out_rank); - auto x_dim_vec = phi::vectorize(x_dims); - if (x_dim_vec.size() > repeat_times.size()) { - auto diff = x_dim_vec.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, -1); - } else { - auto diff = repeat_times.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); - } - for (size_t i = 0; i < repeat_times.size(); ++i) { - if (x_dim_vec[i] == -1 || repeat_times[i] == -1) { - out_shape[i] = -1; - } else { - PADDLE_ENFORCE_GT( - repeat_times[i], 0, - platform::errors::InvalidArgument( - "Every element of the input 'repeat_times' for tile op must be " - "greater than 0, but the value given is %d.", - repeat_times[i])); - out_shape[i] = x_dim_vec[i] * repeat_times[i]; - } - } - - ctx->SetOutputDim("Out", phi::make_ddim(out_shape)); - if (out_shape[0] == x_dims[0]) { - ctx->ShareLoD("X", "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -268,38 +212,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(tile, TileInferMetaFunctor, + PD_INFER_META(phi::TileInferMeta)); + REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker, ops::TileGradOpMaker, - ops::TileGradOpMaker); + ops::TileGradOpMaker, + TileInferMetaFunctor); REGISTER_OPERATOR(tile_grad, ops::TileGradOp, ops::TileDoubleGradOpMaker, ops::TileDoubleGradOpMaker, ops::TileGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); 
-REGISTER_OP_CPU_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); -REGISTER_OP_CUDA_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel); -#endif diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h deleted file mode 100644 index 1698b5e3c6322e2cd9cbe7cf4839e2fc08627b32..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/tile_op.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -#define MAX_RANK_SUPPORTED 6 - -namespace paddle { -namespace operators { -inline std::vector get_repeat_times( - const framework::ExecutionContext& ctx) { - if (ctx.HasInput("RepeatTimes")) { - auto* repeat_tensor = ctx.Input("RepeatTimes"); - auto* repeat_data = repeat_tensor->data(); - framework::Tensor cpu_repeat_tensor; - if (platform::is_gpu_place(repeat_tensor->place()) || - platform::is_xpu_place(repeat_tensor->place()) || - platform::is_npu_place(repeat_tensor->place())) { - paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(), - &cpu_repeat_tensor); - repeat_data = cpu_repeat_tensor.data(); - } - auto vec_repeat_times = - std::vector(repeat_data, repeat_data + repeat_tensor->numel()); - return vec_repeat_times; - } - - auto list_repeat_times_tensor = - ctx.MultiInput("repeat_times_tensor"); - if (list_repeat_times_tensor.size() > 0) { - // get tensor from - std::vector vec_repeat_times; - for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { - auto tensor = list_repeat_times_tensor[i]; - if (platform::is_gpu_place(tensor->place()) || - platform::is_xpu_place(tensor->place()) || - platform::is_npu_place(tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - vec_repeat_times.push_back(*temp.data()); - } else { - vec_repeat_times.push_back(*tensor->data()); - } - } - return vec_repeat_times; - } else { - return ctx.Attr>("repeat_times"); - } -} - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; -template -using EigenTensor = framework::EigenTensor; -using framework::To32BitIndex; - -template -class TileKernel : public framework::OpKernel { - public: - void Compute(const 
framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, 1, platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, rank)); - auto repeat_times = get_repeat_times(context); - int repeat_times_size = repeat_times.size(); - PADDLE_ENFORCE_GE( - repeat_times_size, 1, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile " - "op must be positive, but the value received is %d.", - repeat_times_size)); - PADDLE_ENFORCE_LE( - repeat_times_size, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, repeat_times_size)); - rank = std::max(rank, repeat_times_size); - switch (rank) { - case 1: - Tile<1>(context); - break; - case 2: - Tile<2>(context); - break; - case 3: - Tile<3>(context); - break; - case 4: - Tile<4>(context); - break; - case 5: - Tile<5>(context); - break; - case 6: - Tile<6>(context); - break; - } - } - - protected: - template - void Tile(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - - auto in_dims = in0->dims(); - auto repeat_times = get_repeat_times(context); - for (size_t i = 0; i < repeat_times.size(); ++i) { - PADDLE_ENFORCE_GT( - repeat_times[i], 0, - platform::errors::InvalidArgument( - "All elements of the input 'repeat_times' for tile op must " - "be positive integers, but the value received is %d.", - repeat_times[i])); - } - auto vec_in_dims = phi::vectorize(in_dims); - if (repeat_times.size() < vec_in_dims.size()) { 
- int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - PADDLE_ENFORCE_EQ( - repeat_times.size(), vec_in_dims.size(), - platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' and the rank (%d) of the input " - "'repeat_times' for tile op must match after promotion.", - vec_in_dims.size(), repeat_times.size())); - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims(new_in_dims); - for (size_t i = 0; i < repeat_times.size(); ++i) { - out_dims[i] *= repeat_times[i]; - } - - out0->Resize(out_dims); - auto x = EigenTensor::From(*in0, new_in_dims); - out0->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*out0, out_dims); - auto& place = - *context.template device_context().eigen_device(); - // use 32-bit index to speed up - bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); - if (use_32bit_index) { - EigenBroadcast, T, Rank>::Eval( - place, To32BitIndex(y), To32BitIndex(x), bcast_dims); - } else { - EigenBroadcast, T, Rank>::Eval(place, y, x, - bcast_dims); - } - } -}; - -template -class TileGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto repeat_times = get_repeat_times(context); - auto x_dims = x->dims(); - auto vec_in_dims = phi::vectorize(x_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - // 1. 
reshape_dims_vec is the broadcast parameter. - // 2. reduce_dims_vec is the dimension parameter to compute gradients. For - // each dimension expanded, the gradients should be summed to original - // size. - std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < repeat_times.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(repeat_times[i]); - reshape_dims_vec.push_back(vec_in_dims[i]); - } - - int dims = reduce_dims_vec.size(); - - bool just_copy = true; - for (size_t i = 0; i < repeat_times.size(); i++) { - if (repeat_times[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - dx->mutable_data(context.GetPlace()); - framework::TensorCopy(*dout, context.GetPlace(), context.device_context(), - dx); - // TensorCopy may change the dims of dx - dx->Resize(x_dims); - } else { - PADDLE_ENFORCE_GE(dims, 1, - platform::errors::InvalidArgument( - "Th rank of the input 'Out@GRAD' for tile_grad op " - " must be greater than or equal to 1, but " - "the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for tile_grad op " - "must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, dims)); - switch (dims) { - case 1: - TileBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - TileBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - TileBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - TileBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - TileBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - TileBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - 
PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void TileBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tile_op_functor.h b/paddle/fluid/operators/tile_op_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..95bfb9f4e1a9d374c66997567f5d80df8b5d8701 --- /dev/null +++ b/paddle/fluid/operators/tile_op_functor.h @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/fluid/framework/operator.h" + +#define MAX_RANK_SUPPORTED 6 + +namespace paddle { +namespace operators { + +inline std::vector get_repeat_times( + const framework::ExecutionContext& ctx) { + if (ctx.HasInput("RepeatTimes")) { + auto* repeat_tensor = ctx.Input("RepeatTimes"); + auto* repeat_data = repeat_tensor->data(); + framework::Tensor cpu_repeat_tensor; + if (platform::is_gpu_place(repeat_tensor->place()) || + platform::is_xpu_place(repeat_tensor->place()) || + platform::is_npu_place(repeat_tensor->place())) { + paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(), + &cpu_repeat_tensor); + repeat_data = cpu_repeat_tensor.data(); + } + auto vec_repeat_times = + std::vector(repeat_data, repeat_data + repeat_tensor->numel()); + return vec_repeat_times; + } + + auto list_repeat_times_tensor = + ctx.MultiInput("repeat_times_tensor"); + if (list_repeat_times_tensor.size() > 0) { + // get tensor from + std::vector vec_repeat_times; + for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { + auto tensor = list_repeat_times_tensor[i]; + if (platform::is_gpu_place(tensor->place()) || + platform::is_xpu_place(tensor->place()) || + platform::is_npu_place(tensor->place())) { + framework::Tensor temp; + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); + vec_repeat_times.push_back(*temp.data()); + } else { + vec_repeat_times.push_back(*tensor->data()); + } + } + return vec_repeat_times; + } else { + return ctx.Attr>("repeat_times"); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc index 9e306c7be537bc7403812f4907541e1a9671c12a..cea6b458aec782923722cb37fe41c1c4d59292e5 100644 --- a/paddle/fluid/operators/tile_op_npu.cc +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -11,7 +11,8 @@ 
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tile_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/tile_op_functor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/tile_op_xpu.cc b/paddle/fluid/operators/tile_op_xpu.cc index 6b60b167a2465fcb03d8ec088cfa288f9fb14af1..598377587d6f73e0c21abbc4d3819d16eacb1f23 100644 --- a/paddle/fluid/operators/tile_op_xpu.cc +++ b/paddle/fluid/operators/tile_op_xpu.cc @@ -11,11 +11,14 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/tile_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/tile_op_functor.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class TileXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index d60976928e00cb5ecfde6ca65e0a1b0d5b1ef938..80c9935057cb5d5809fde545bdd0772afdaf2702 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -51,6 +51,19 @@ namespace operators { using Tensor = framework::Tensor; +inline void GetDims(const phi::DDim& dim, int axis, int* pre, int* n, + int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + struct SegmentOffsetIter { EIGEN_DEVICE_FUNC explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index 810afc901df57bfa3c518b2363fb9153ee353762..d1add111e1d24cb711955a9aff06eb19feb35dc9 100644 --- 
a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" #include +#include "paddle/fluid/framework/op_registry.h" + namespace paddle { namespace operators { @@ -173,15 +174,3 @@ REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, ops::TopkV2GradOpMaker); REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); - -REGISTER_OP_CPU_KERNEL(top_k_v2, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel) - -REGISTER_OP_CPU_KERNEL( - top_k_v2_grad, ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel) diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu deleted file mode 100644 index 84d8eef53bf72c5dbd5404a889925541374c9823..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/top_k_v2_op.cu +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define FIXED_BLOCK_DIM_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kBlockDim = (dim); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM(...) \ - FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) - -template -class TopkV2OpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - - // get the attributes - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - const bool& sorted = static_cast(ctx.Attr("sorted")); - const bool& largest = static_cast(ctx.Attr("largest")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto* k_t = ctx.Input("K"); - if (k_t) { - Tensor k_host; - framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); - k = k_host.data()[0]; - framework::DDim output_dims = output->dims(); - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - - const auto& out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - // if get the topK from the last axis - const int64_t& 
input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, input, input_width, input_height, k, output, - indices, largest)) { - // Successed, return. - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - // NOTE: pass lds and dim same to input width. - // NOTE: old matrix implementation of stride is different to eigen. - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - } else { - // if get topK not from the last axis, will tranpose the tensor and get - // TopK - - // first step, prepare the trans args for the tranpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = out_dims[trans[i]]; - } - // second step, 
tranpose the input - Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - // third step, calcluate the topk - // allocate the tmp cuda memory for the tmp result - Tensor trans_ind; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - Tensor trans_out; - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind, largest)) { - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute( - ndims, dev_ctx, trans_out, output, trans); - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - } - } -}; - -#undef FIXED_BLOCK_DIM_BASE -#undef FIXED_BLOCK_DIM -template -class TopkV2OpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - const auto& out_dims = indices->dims(); - - // get the real the axis and the k - if (axis < 0) axis += in_dims.size(); - const int& k = out_dims[axis]; - const int& raw_height = in_dims[axis]; - - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - auto 
ComputeBlockSize = [](int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - int block_size = ComputeBlockSize(post * k); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - - // lanuch the cuda kernel to assign the grad - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, k); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - top_k_v2, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, float>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, double>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int64_t>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, paddle::platform::float16>); diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h deleted file mode 100644 index a808207476f3b9be2636741d7b0ac06002ccba08..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/top_k_v2_op.h +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -/* - The reason why we need the topk v2 is because the compatibility. We redefine - the NaN is maximum value - in the process of comparing. If do not add the topk v2, will affect the - inference result of model that traing - by the older version paddlepaddle. -*/ - -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n, - int* post) { - *pre = 1; - *post = 1; - *n = dim[axis]; - for (int i = 0; i < axis; ++i) { - (*pre) *= dim[i]; - } - for (int i = axis + 1; i < dim.size(); ++i) { - (*post) *= dim[i]; - } -} - -template -static void FullTopK(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices, - const int& k, const bool& largest, const bool& sorted) { - // when the k is small, will the partial sort - bool partial_sort_flag = (k * 64) < input_width; - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - // Eigen::DSizes flat2dims(input_height, input_width); - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = 
framework::EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.emplace_back(std::pair(e_input(i, j), j)); - } - } - if (partial_sort_flag) { - std::partial_sort( - col_vec.begin(), col_vec.begin() + k, col_vec.end(), - [&largest](const std::pair& l, const std::pair& r) { - if (largest) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - } else { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - } - }); - } else { - // use the nth-element to get the K-larger or K-small element - if (largest) { - std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - }); - // the nth-element will get the unorder elements, sort the element - if (sorted) { - std::sort(col_vec.begin(), col_vec.begin() + k - 1, - [&largest](const std::pair& l, - const std::pair& r) { - return (std::isnan(static_cast(l.first)) && - !std::isnan(static_cast(r.first))) || - (l.first > r.first); - }); - } - } else { - std::nth_element( - col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - // the nth-element will get the unorder elements, sort the element - if (sorted) { - std::sort( - col_vec.begin(), col_vec.begin() + k - 1, - [](const std::pair& l, const std::pair& r) { - return (!std::isnan(static_cast(l.first)) && - std::isnan(static_cast(r.first))) || - (l.first < r.first); - }); - } - } - } - for (Type j = 0; j < k; ++j) { - t_out[i * k + j] = col_vec[j].first; - t_indices[i * k + j] = col_vec[j].second; - } - } -} - -template -static void FullTopKAssign(const Type& input_height, const Type& 
input_width, - const int& input_dim, const framework::Tensor* input, - const framework::Tensor* indices, T* output_data, - const int& k) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = framework::EigenVector::Flatten(*input); - auto e_indices = framework::EigenVector::Flatten(*indices); - for (Type j = 0; j < k; ++j) { - output_data[i * input_width + e_indices(j)] = e_input(j); - } - } else { - auto e_input = framework::EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = - framework::EigenMatrix::Reshape(*indices, input_dim - 1); - for (Type j = 0; j < k; ++j) { - output_data[i * input_width + e_indices(i, j)] = e_input(i, j); - } - } - } -} - -template -class TopkV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // Get the top k elements of each row of input tensor - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto* indices = context.Output("Indices"); - const auto& in_dims = input->dims(); - int k = static_cast(context.Attr("k")); - const auto& sorted = static_cast(context.Attr("sorted")); - const auto& largest = static_cast(context.Attr("largest")); - - // axis < 0, cacluate the real axis - int axis = static_cast(context.Attr("axis")); - if (axis < 0) axis += in_dims.size(); - - // if K tensor is not null, will the use K tesnor as k - auto* k_t = context.Input("K"); - if (k_t) { - k = k_t->data()[0]; - framework::DDim output_dims = output->dims(); - // accroding to axis to set K value in the dim - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - - T* output_data = output->mutable_data(context.GetPlace()); - int64_t* indices_data = indices->mutable_data(context.GetPlace()); - const auto& out_dims = output->dims(); - if (axis + 1 == in_dims.size()) { - const int64_t& input_height = - 
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - FullTopK(input_height, input_width, in_dims.size(), input, - output_data, indices_data, k, largest, sorted); - } else { - // if the topk dims is not last dim, will tranpose and do topk - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - - // get the trans input_dims, out_dims - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - for (size_t i = 0; i < trans.size(); i++) { - trans_out_dims[i] = out_dims[trans[i]]; - } - - Tensor trans_inp; - trans_inp.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - - // transpose the input value - TransCompute(ndims, dev_context, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - // Allocate the temp tensor to the save the topk indices, values - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); - Tensor tmp_indices; - auto* t_ind = - tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); - - // get the TopK value - FullTopK(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, k, largest, sorted); - // transpose back - TransCompute( - ndims, dev_context, tmp_indices, indices, trans); - TransCompute(ndims, dev_context, tmp_out, - output, trans); - } - } -}; - -template -class TopkV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - 
auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); - int axis = static_cast(context.Attr("axis")); - - const auto& in_dims = x->dims(); - const auto& out_dims = indices->dims(); - - // axis < 0, get the real axis - axis = (axis < 0) ? (in_dims.size() + axis) : axis; - const size_t& k = out_dims[axis]; - - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - if (axis + 1 == in_dims.size()) { - // allocate the memory for the input_grad - - // assign the out_grad to input_grad directly - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - // init the output grad with 0, because some input elements has no grad - memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); - // Assign the output_grad to input_grad - FullTopKAssign(input_height, input_width, in_dims.size(), out_grad, - indices, x_grad_data, k); - } else { - // can not assign grad to input_grad, must do the transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(out_dims.size() - 1); - for (int i = axis + 1; i < out_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - framework::DDim trans_dims(out_dims); - framework::DDim trans_in_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = out_dims[trans[i]]; - trans_in_dims[i] = in_dims[trans[i]]; - } - // transpose the out_grad, indices - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, context.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, context.GetPlace()); - int ndims = trans.size(); - auto& dev_context = - context.template device_context(); - - // Do transpose - TransCompute(ndims, dev_context, *out_grad, - &trans_dO, trans); - TransCompute( - ndims, dev_context, 
*indices, &trans_ind, trans); - const int64_t input_height = phi::product( - phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); - const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; - - // Assign the out_grad to tranpose input_grad - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); - memset(t_out, 0, x_grad->numel() * sizeof(T)); - - FullTopKAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out, k); - - // Transpose back - TransCompute(ndims, dev_context, tmp_out, - x_grad, trans); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc index 5b8a6b3e75449508afa5d316d81f97ab815c9ea9..caaae02124c926b9e4be08744e4192dab20ca5d0 100644 --- a/paddle/fluid/operators/top_k_v2_op_mlu.cc +++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc index e11070638834c46a6628d652216e1ddddeb2487d..dff5c2d3f39378486bb5d2f8010d005d57b20550 100644 --- a/paddle/fluid/operators/top_k_v2_op_npu.cc +++ b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/top_k_v2_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/top_k_v2_op_xpu.cc b/paddle/fluid/operators/top_k_v2_op_xpu.cc index 49daac2ff0da63c542a807dc97925c6989559f14..4d9c39be92eff029e66cdde900318b045c2b531f 100644 --- a/paddle/fluid/operators/top_k_v2_op_xpu.cc +++ b/paddle/fluid/operators/top_k_v2_op_xpu.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/top_k_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/transpose_op.h" #include "xpu/refactor/math.h" diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc index 63b914a31a86aef48e952a4877c7beb670075cc4..c6c0fa3c0019eac742a9c70ea53a438f5a474895 100644 --- a/paddle/fluid/operators/trace_op.cc +++ b/paddle/fluid/operators/trace_op.cc @@ -61,7 +61,7 @@ the 2-D planes specified by dim1 and dim2. 
)DOC"); } }; -class TraceOpGrad : public framework::OperatorWithKernel { +class TraceGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -107,14 +107,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TraceGradNoNeedBufferVarsInferer, "Input"); } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor, - PT_INFER_META(phi::TraceInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trace, TraceInferShapeFunctor, + PD_INFER_META(phi::TraceInferMeta)); REGISTER_OPERATOR(trace, ops::TraceOp, ops::TraceOpMaker, ops::TraceGradOpMaker, ops::TraceGradOpMaker, TraceInferShapeFunctor); -REGISTER_OPERATOR(trace_grad, ops::TraceOpGrad, +REGISTER_OPERATOR(trace_grad, ops::TraceGradOp, ops::TraceGradNoNeedBufferVarsInferer); /* ========================== register checkpoint ===========================*/ diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index 5617d728a51dc1c5e21053a2af05d062ecc1a22b..fb39034c8e92c1ac39aa1ca6e57d5a08ca1ca9d6 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/triangular_solve_op.cc b/paddle/fluid/operators/triangular_solve_op.cc index 9233917b0931b98d30b736ec9b69fd68c0604d18..35b925ca172b7ccb665978010dbcdd2cb10c9678 100644 --- a/paddle/fluid/operators/triangular_solve_op.cc +++ b/paddle/fluid/operators/triangular_solve_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/triangular_solve_op.h" -#include "paddle/fluid/operators/solve_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -22,58 +23,6 @@ class TriangularSolveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "TriangularSolve"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "TriangularSolve"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TriangularSolve"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - auto x_dims_n = x_dims.size(); - auto y_dims_n = y_dims.size(); - - PADDLE_ENFORCE_GE( - x_dims_n, 2, platform::errors::InvalidArgument( - "The input tensor X's dimensions of TriangularSolveOp " - "should be >= 2. But received X's " - "dimensions = %d, X's shape = [%s]", - x_dims.size(), x_dims)); - - PADDLE_ENFORCE_GE( - y_dims_n, 2, platform::errors::InvalidArgument( - "The input tensor Y's dimensions of TriangularSolveOp " - "should be >=2. 
But received Y's " - "dimensions = %d, Y's shape = [%s]", - y_dims.size(), y_dims)); - - PADDLE_ENFORCE_EQ(x_dims[x_dims_n - 2], x_dims[x_dims_n - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should " - "be square matrices " - "But received X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_dims_n - 2], x_dims[x_dims_n - 1])); - - std::vector x_dims_vec = phi::vectorize(x_dims); - std::vector y_dims_vec = phi::vectorize(y_dims); - - std::vector x_dims_vec_cut(x_dims_vec.begin(), - x_dims_vec.end() - 2); - std::vector y_dims_vec_cut(y_dims_vec.begin(), - y_dims_vec.end() - 2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(x_dims_vec_cut, y_dims_vec_cut); - - std::vector y_broadcast_dims({expand_batch_portion}); - y_broadcast_dims.insert(y_broadcast_dims.end(), {y_dims_vec[y_dims_n - 2], - y_dims_vec[y_dims_n - 1]}); - - // dim of 'Out' is the same with 'Y' after broadcast - ctx->SetOutputDim("Out", phi::make_ddim(y_broadcast_dims)); - ctx->ShareLoD("X", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const { return framework::OpKernelType( @@ -168,20 +117,15 @@ class TriangularSolveOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(triangular_solve, TriangularSolveInferShapeFunctor, + PD_INFER_META(phi::TriangularSolveInferMeta)); + REGISTER_OPERATOR(triangular_solve, ops::TriangularSolveOp, ops::TriangularSolveOpMaker, ops::TriangularSolveOpInferVarType, ops::TriangularSolveOpGradMaker, - ops::TriangularSolveOpGradMaker); + ops::TriangularSolveOpGradMaker, + TriangularSolveInferShapeFunctor); REGISTER_OPERATOR(triangular_solve_grad, ops::TriangularSolveGradOp); - -REGISTER_OP_CPU_KERNEL( - triangular_solve, - ops::TriangularSolveKernel, - ops::TriangularSolveKernel); - -REGISTER_OP_CPU_KERNEL( - triangular_solve_grad, - ops::TriangularSolveGradKernel, - 
ops::TriangularSolveGradKernel); diff --git a/paddle/fluid/operators/triangular_solve_op.cu b/paddle/fluid/operators/triangular_solve_op.cu deleted file mode 100644 index 7df98517e8418905f0f8c8ce603762967a8b5f38..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/triangular_solve_op.cu +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" - -namespace paddle { -namespace operators { - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor& in, Tensor* out, - const framework::ExecutionContext& ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - gpuStream_t stream = 
ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), - out_reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - triangular_solve, - ops::TriangularSolveKernel, - ops::TriangularSolveKernel); - -REGISTER_OP_CUDA_KERNEL( - triangular_solve_grad, - ops::TriangularSolveGradKernel, - ops::TriangularSolveGradKernel); diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h deleted file mode 100644 index 4e68add096ff28f5378b02689248c3957c1e8ae9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/triangular_solve_op.h +++ /dev/null @@ -1,229 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "glog/logging.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/solve_op.h" -#include "paddle/fluid/operators/tril_triu_op.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void triangular_solve(const DeviceContext& context, const Tensor& x, - const Tensor& y, Tensor* out, bool upper, - bool transpose, bool unitriangular) { - // Tensor broadcast use eigen - std::vector x_bst_dims_vec; - std::vector y_bst_dims_vec; - std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(x, y); - - Tensor x_bst(x.type()); - TensorExpand(context, x, &x_bst, x_bst_dims_vec); - - Tensor y_bst(y.type()); - TensorExpand(context, y, &y_bst, y_bst_dims_vec); - - // TriangularSolveFunctor performs calculations in-place - // x_clone should be a copy of 'x' after broadcast - // out should be a copy of 'y' after broadcast - Tensor x_clone(x.type()); - x_clone.Resize(phi::make_ddim(x_bst_dims_vec)); - x_clone.mutable_data(context.GetPlace()); - framework::TensorCopy(x_bst, context.GetPlace(), context, &x_clone); - - out->Resize(phi::make_ddim(y_bst_dims_vec)); - out->mutable_data(context.GetPlace()); - framework::TensorCopy(y_bst, context.GetPlace(), context, out); - - math::TriangularSolveFunctor functor; - functor(context, &x_clone, out, /*left=*/true, upper, transpose, - unitriangular); -} - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor& input, Tensor* output, - const framework::ExecutionContext& ctx); -}; - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor& in, Tensor* out, - const framework::ExecutionContext& 
ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - out->Resize(phi::make_ddim(out_bst_dims)); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - - ReduceKernelFunctor( - &in, out, out_reduce_dims, true, false, ctx) - .template apply(); - out->Resize(phi::make_ddim(out_dims)); - } -}; - -template -class TriangularSolveKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - - bool upper = ctx.template Attr("upper"); - bool transpose = ctx.template Attr("transpose"); - bool unitriangular = ctx.template Attr("unitriangular"); - - const auto& dev_ctx = ctx.template device_context(); - triangular_solve(dev_ctx, *x, *y, out, upper, transpose, - unitriangular); - } -}; - -template -class TriangularSolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - const auto* y = ctx.Input("Y"); - const auto* out = ctx.Input("Out"); - const auto* dout = - ctx.Input(framework::GradVarName("Out")); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - bool upper = ctx.template Attr("upper"); - bool transpose = ctx.template Attr("transpose"); - bool unitriangular = 
ctx.template Attr("unitriangular"); - - auto& dev_ctx = ctx.template device_context(); - - std::vector x_bst_dims_vec; - std::vector y_bst_dims_vec; - std::tie(x_bst_dims_vec, y_bst_dims_vec) = get_broadcast_dims(*x, *y); - - Tensor dy_bst(y->type()); - if (dy) { - dy->mutable_data(y->dims(), dev_ctx.GetPlace()); - dy_bst.Resize(phi::make_ddim(y_bst_dims_vec)); - dy_bst.mutable_data(dev_ctx.GetPlace()); - - // calculate x's conjugate for complex - Tensor x_conj(x->type()); - platform::ForRange x_for_range(dev_ctx, x->numel()); - phi::funcs::ConjFunctor x_functor( - x->data(), x->numel(), - x_conj.mutable_data(x->dims(), dev_ctx.GetPlace())); - x_for_range(x_functor); - - // reuse forward to get dy_bst, and the result has been broadcated. - triangular_solve(dev_ctx, x_conj, *dout, &dy_bst, upper, - !transpose, unitriangular); - - if (dy_bst.dims() == dy->dims()) { - framework::TensorCopy(dy_bst, dev_ctx.GetPlace(), dev_ctx, dy); - } else { - MatrixReduceSumFunctor functor; - functor(dy_bst, dy, ctx); - dy->Resize(y->dims()); - } - } - - Tensor dx_bst(x->type()); - if (dx) { - dx->mutable_data(x->dims(), dev_ctx.GetPlace()); - dx_bst.Resize(phi::make_ddim(x_bst_dims_vec)); - dx_bst.mutable_data(dev_ctx.GetPlace()); - - // calculate out's conjugate for complex - Tensor out_conj(out->type()); - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - - auto blas = phi::funcs::GetBlas(ctx); - if (transpose) { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, false); - auto mat_dim_b = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, true); - blas.MatMul(out_conj, mat_dim_a, dy_bst, mat_dim_b, static_cast(-1), - &dx_bst, static_cast(0)); - } else { - auto mat_dim_a = - phi::funcs::CreateMatrixDescriptor(dy_bst.dims(), 0, false); - auto mat_dim_b = - 
phi::funcs::CreateMatrixDescriptor(out_conj.dims(), 0, true); - blas.MatMul(dy_bst, mat_dim_a, out_conj, mat_dim_b, static_cast(-1), - &dx_bst, static_cast(0)); - } - - Tensor dx_bst_upper(x->type()); - // get upper or lower triangular - dx_bst_upper.Resize(dx_bst.dims()); - dx_bst_upper.mutable_data(dev_ctx.GetPlace()); - - const auto& dims = dx_bst.dims(); - const auto H = dims[dims.size() - 2]; - const auto W = dims[dims.size() - 1]; - platform::ForRange x_for_range(dev_ctx, dx_bst.numel()); - TrilTriuCompute tril_triu_computer(dx_bst.data(), unitriangular, - !upper, H, W, - dx_bst_upper.data()); - x_for_range(tril_triu_computer); - - if (dx_bst_upper.dims() == dx->dims()) { - framework::TensorCopy(dx_bst_upper, dev_ctx.GetPlace(), dev_ctx, dx); - } else { - MatrixReduceSumFunctor functor; - functor(dx_bst_upper, dx, ctx); - dx->Resize(x->dims()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e36cbcf228cfbf30c8fcd5562ac40f38a5467cdb --- /dev/null +++ b/paddle/fluid/operators/tril_triu_op_xpu.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under +the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/tril_triu_op.h" +#include "paddle/fluid/platform/device/device_wrapper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class TrilTriuXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* x = context.Input("X"); + const auto* x_data = x->data(); + auto* out = context.Output("Out"); + auto* out_data = out->mutable_data(context.GetPlace()); + + const int diagonal = context.Attr("diagonal"); + const bool lower = context.Attr("lower"); + auto xshape = phi::vectorize(x->dims()); + auto& dev_ctx = context.template device_context(); + int r = 0; + if (lower) { + r = xpu::tril(dev_ctx.x_context(), x_data, out_data, xshape, diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op"); + } else { + r = xpu::triu(dev_ctx.x_context(), x_data, out_data, xshape, diagonal); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op"); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + tril_triu, ops::TrilTriuXPUKernel, + ops::TrilTriuXPUKernel); +#endif diff --git a/paddle/fluid/operators/trunc_op.cc b/paddle/fluid/operators/trunc_op.cc index 54f4deac80a74e2e471036c2e25d08a582e29a9d..b77775f5a8c094fc7aa05f2f017834681424207f 100644 --- a/paddle/fluid/operators/trunc_op.cc +++ b/paddle/fluid/operators/trunc_op.cc @@ -69,8 +69,8 @@ class TruncGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle -DELCARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, - PT_INFER_META(phi::UnchangedInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(trunc, ops::TruncOp, ops::TruncOpMaker, diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc 
b/paddle/fluid/operators/truncated_gaussian_random_op.cc index 6eb7f922dfdbec41aa1c47d11e1decc259d08689..dc5a66dce16d698f9cfac01e3bdc776d08c2af19 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -17,8 +17,10 @@ limitations under the License. */ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/truncated_gaussian_random_op.h" +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -27,26 +29,6 @@ class TruncatedGaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::NotFound( - "Output(Out) of TruncatedGaussianRandomOp should not be null.")); - auto shape = ctx->Attrs().Get>("shape"); - std::vector out_dim; - out_dim.reserve(shape.size()); - for (auto dim : shape) { - out_dim.push_back(static_cast(dim)); - } - PADDLE_ENFORCE_GT( - shape.size(), 0UL, - platform::errors::InvalidArgument( - "the input shape of TruncatedGaussianRandomOp must be set, " - "But the rank of shape we received is %d", - shape.size())); - ctx->SetOutputDim("Out", phi::make_ddim(out_dim)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -99,6 +81,14 @@ Used to initialize tensors with truncated gaussian random generator. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random, - ops::TruncatedGaussianRandomOp, - ops::TruncatedGaussianRandomOpMaker); + +DECLARE_INFER_SHAPE_FUNCTOR( + truncated_gaussian_random, TruncatedGaussianRandomInferShapeFunctor, + PD_INFER_META(phi::TruncatedGaussianRandomInferMeta)); + +REGISTER_OPERATOR( + truncated_gaussian_random, ops::TruncatedGaussianRandomOp, + ops::TruncatedGaussianRandomOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + TruncatedGaussianRandomInferShapeFunctor); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.h b/paddle/fluid/operators/truncated_gaussian_random_op.h index a6ff2f686cb76bb03de8074014f82d6ff9e57bd3..8af6e281424eaabd8d6ea86843b3c13aa36cba47 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.h +++ b/paddle/fluid/operators/truncated_gaussian_random_op.h @@ -1,11 +1,8 @@ /* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -140,19 +137,9 @@ T Erfinv(T x) { template struct TruncatedNormal { T mean, std; - T a_normal_cdf; - T b_normal_cdf; - TruncatedNormal(T mean, T std) : mean(mean), std(std) { - auto normal_cdf = [](T x) { - return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; - }; - a_normal_cdf = normal_cdf(-2.0); - b_normal_cdf = normal_cdf(2.0); - } - + TruncatedNormal(T mean, T std) : mean(mean), std(std) {} T operator()(T value) const { - auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; + return std::sqrt(2.0) * Erfinv(value) * std + mean; } }; diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 261d9cee2d5cd25c510aacb280b9623f985eb1f7..4ed0dd22ec086923bbe47af192cab8d001ae734f 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -84,8 +84,13 @@ class NPUTruncatedGaussianRandomKernel : public framework::OpKernel { Tensor cpu_tensor(tensor->dtype()); cpu_tensor.Resize(tensor->dims()); T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); - std::uniform_real_distribution dist(std::numeric_limits::min(), - 1.0); + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + float a_normal_cdf = normal_cdf((-2.0 - mean) / std); + float b_normal_cdf = normal_cdf((2.0 - mean) / std); + std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc index 803b61fbe813f85f48b71d1de7fc41eb26e4b8da..984d9f397cc655b4cfd7e0bc211db1665252272f 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_xpu.cc 
@@ -32,8 +32,13 @@ class XPUTruncatedGaussianRandomKernel : public framework::OpKernel { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); - std::uniform_real_distribution dist(std::numeric_limits::min(), - 1.0); + auto normal_cdf = [](float x) { + return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0; + }; + float a_normal_cdf = normal_cdf((-2.0 - mean) / std); + float b_normal_cdf = normal_cdf((2.0 - mean) / std); + std::uniform_real_distribution dist(2.0 * a_normal_cdf - 1.0, + 2.0 * b_normal_cdf - 1.0); TruncatedNormal truncated_normal(mean, std); int64_t size = tensor->numel(); diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index c45b839d5b40bd1d0db25743406bb8cc319f1280..02fed3de6cef74f19a5dd4d8500017e6097a56a4 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ b/paddle/fluid/operators/unfold_op.cc @@ -119,8 +119,8 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnfoldGradOpNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; -DELCARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor, - PT_INFER_META(phi::UnfoldInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(unfold, UnfoldInferShapeFunctor, + PD_INFER_META(phi::UnfoldInferMeta)); REGISTER_OPERATOR(unfold, ops::UnfoldOp, ops::UnfoldOpMaker, ops::UnfoldGradMaker, ops::UnfoldGradMaker, diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index a864c48ad757411861b6d2b3be40361c347601f8..b941dc21c3ab213e5abc2c4c908413b2b6222c41 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -25,8 +25,9 @@ DECLARE_bool(use_curand); #include #include #include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/operators/index_impl.cu.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" #endif namespace paddle { @@ 
-206,21 +207,21 @@ void UniformRandom(const framework::ExecutionContext& context, if (gen_cuda->GetIsInitPy() && seed_flag) { if (FLAGS_use_curand) { using MT = typename details::MPTypeTrait::Type; - distribution::uniform_distribution dist; - distribution::uniform_transform trans(min, max); - distribution::distribution_and_transform(dev_cxt, tensor, dist, trans); + phi::funcs::uniform_distribution dist; + phi::funcs::uniform_real_transform trans(min, max); + phi::funcs::distribution_and_transform(dev_cxt, tensor, dist, trans); } else { auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; auto func = UniformGeneratorOffset(min, max, seed_offset.first, diag_num, diag_step, diag_val, gen_offset); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } } else { auto func = UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); - IndexKernel>(dev_cxt, tensor, func); + phi::IndexKernel>(dev_cxt, tensor, func); } } #endif diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 5ab2004617810b34276632fa487e8f12d7b3b915..1be8f3387dbad85e0dce3593ad61b9c116b10ef0 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -236,7 +236,6 @@ register_unity_group(cc scatter_nd_add_op.cc scatter_op.cc seed_op.cc - segment_pool_op.cc select_input_op.cc select_output_op.cc) register_unity_group(cc @@ -496,8 +495,7 @@ register_unity_group(cu scale_op.cu scatter_nd_add_op.cu scatter_op.cu - seed_op.cu - segment_pool_op.cu) + seed_op.cu) register_unity_group(cu roi_pool_op.cu selu_op.cu diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h index 7f676cbb65ee460cdf639641330d49b5774f95a5..f6112fb59c12252255861825ff9d7b534c542665 100644 --- a/paddle/fluid/operators/unsqueeze_op.h +++ b/paddle/fluid/operators/unsqueeze_op.h @@ -16,7 +16,6 @@ limitations under the 
License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/blas/blas.h" diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc index 3e11c952d15f3460f987f6fa2cb28970f97cc96b..a8ced783744a961eb8ce64983de7e9615763c1b6 100644 --- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc +++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/viterbi_decode_op.cc b/paddle/fluid/operators/viterbi_decode_op.cc index bf1cdeed65a8427c19410347209faa099673cb7c..602376d54e0d2a49b6cf4f6a78d332154c188a7e 100644 --- a/paddle/fluid/operators/viterbi_decode_op.cc +++ b/paddle/fluid/operators/viterbi_decode_op.cc @@ -9,8 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/viterbi_decode_op.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -19,47 +21,6 @@ class ViterbiDecodeOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ViterbiDecode"); - OP_INOUT_CHECK(ctx->HasInput("Transition"), "Input", "Transition", - "ViterbiDecode"); - OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "ViterbiDecode"); - OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores", - "ViterbiDecode"); - OP_INOUT_CHECK(ctx->HasOutput("Path"), "Output", "Path", "ViterbiDecode"); - auto in_dims = ctx->GetInputDim("Input"); - PADDLE_ENFORCE_EQ(in_dims.size(), 3, - platform::errors::InvalidArgument( - "The rank of Input in ViterbiDecode must be 3. But " - "received Input's rank is %d.", - in_dims.size())); - auto length_dims = ctx->GetInputDim("Length"); - PADDLE_ENFORCE_EQ(length_dims.size(), 1, - platform::errors::InvalidArgument( - "The rank of Length in ViterbiDecode must be 1. But " - "received Length's rank is %d.", - length_dims.size())); - auto transition_dims = ctx->GetInputDim("Transition"); - PADDLE_ENFORCE_EQ( - transition_dims.size(), 2, - platform::errors::InvalidArgument( - "The rank of Transition in ViterbiDecode must be 2. 
But " - "received Transition's rank is %d.", - transition_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - in_dims[0], length_dims[0], - platform::errors::InvalidArgument( - "The batch size of Input and Length should be equal.")); - PADDLE_ENFORCE_EQ(in_dims[2], transition_dims[0], - platform::errors::InvalidArgument( - "The number of tags of Input (%d) and Transition " - "(%d) should be equal.", - transition_dims[0], in_dims[2])); - } - ctx->SetOutputDim("Scores", length_dims); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -102,8 +63,8 @@ class ViterbiDecodeOpMaker : public framework::OpProtoAndCheckerMaker { namespace ops = paddle::operators; namespace platform = paddle::platform; +DECLARE_INFER_SHAPE_FUNCTOR(viterbi_decode, ViterbiDecodeInferShapeFunctor, + PD_INFER_META(phi::ViterbiDecodeInferMeta)); REGISTER_OP_WITHOUT_GRADIENT(viterbi_decode, ops::ViterbiDecodeOp, - ops::ViterbiDecodeOpMaker); -REGISTER_OP_CPU_KERNEL( - viterbi_decode, ops::ViterbiDecodeKernel, - ops::ViterbiDecodeKernel); + ops::ViterbiDecodeOpMaker, + ViterbiDecodeInferShapeFunctor); diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu deleted file mode 100644 index 68628fb2748c424996e7f0ae24594ff04649f8d6..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/viterbi_decode_op.cu +++ /dev/null @@ -1,206 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/elementwise_functor.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/viterbi_decode_op.h" -#include "paddle/phi/kernels/funcs/gather.cu.h" - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - -namespace paddle { -namespace operators { - -#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ - case (1 << (log2_block_dim)): { \ - constexpr auto kBlockDim = (1 << (log2_block_dim)); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM_CASE(...) \ - FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); - -int64_t ComputeBlockSize(int64_t col) { - if (col > 512) - return 1024; - else if (col > 256) - return 512; - else if (col > 128) - return 256; - else if (col > 64) - return 128; - else if (col > 32) - return 64; - else if (col > 16) - return 32; - else if (col > 8) - return 16; - else - return 8; -} - -template