diff --git a/.gitignore b/.gitignore
index a2009a1ed30a1c6a17627b06170734fc17390d31..801790d0a472080af607e9fbcde0284902a4ead8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,12 +6,14 @@ paddle/fluid/eager/api/generated/*
 paddle/fluid/op_use_default_grad_maker_DEV.spec
 paddle/fluid/op_use_default_grad_maker_PR.spec
 paddle/phi/api/backward/backward_api.h
+paddle/phi/api/backward/sparse_bw_api.h
 paddle/phi/api/include/api.h
 paddle/phi/api/include/sparse_api.h
 paddle/phi/api/lib/api.cc
 paddle/phi/api/lib/dygraph_api.*
 paddle/phi/api/lib/backward_api.cc
 paddle/phi/api/lib/sparse_api.cc
+paddle/phi/api/lib/sparse_bw_api.cc
 paddle/phi/extension.h
 paddle/phi/include/*
 paddle/phi/infermeta/generated.*
@@ -54,6 +56,7 @@ paddle/infrt/dialect/pd_ops.td
 paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td
 paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td
 tools/infrt/kernels.json
+tools/infrt/kernel_signature.json
 paddle/infrt/dialect/pd_ops_info.h
 .lit_test_times.txt
 paddle/infrt/tests/dialect/Output
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4c5f711d2918bc2a2f8322cc9cd9f3a603c56ab1..6988434996bcc4745726b34278eb6007fdf8605f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -53,6 +53,7 @@ option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF)
 # to develop some acl related functionality on x86
 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND})
 option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF)
+option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF)
 # Note(zhouwei): It use option above, so put here
 include(init)
 include(generic) # simplify cmake module
diff --git a/cmake/external/onnxruntime.cmake b/cmake/external/onnxruntime.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..2162f87812d130f19262955798f28e2c2adc4bac
--- /dev/null
+++ b/cmake/external/onnxruntime.cmake
@@ -0,0 +1,94 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if (NOT WITH_ONNXRUNTIME)
+  return()
+endif ()
+
+if (WITH_ARM)
+  message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu")
+  return()
+endif ()
+
+INCLUDE(ExternalProject)
+
+add_definitions(-DPADDLE_WITH_ONNXRUNTIME)
+
+SET(ONNXRUNTIME_PROJECT "extern_onnxruntime")
+SET(ONNXRUNTIME_PREFIX_DIR ${THIRD_PARTY_PATH}/onnxruntime)
+SET(ONNXRUNTIME_SOURCE_DIR ${THIRD_PARTY_PATH}/onnxruntime/src/${ONNXRUNTIME_PROJECT})
+SET(ONNXRUNTIME_INSTALL_DIR ${THIRD_PARTY_PATH}/install/onnxruntime)
+SET(ONNXRUNTIME_INC_DIR "${ONNXRUNTIME_INSTALL_DIR}/include" CACHE PATH "onnxruntime include directory." FORCE)
+SET(ONNXRUNTIME_LIB_DIR "${ONNXRUNTIME_INSTALL_DIR}/lib" CACHE PATH "onnxruntime lib directory." FORCE)
+SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${ONNXRUNTIME_LIB_DIR}")
+
+
+if (WIN32)
+  SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-win-x64-1.10.0.zip")
+elseif (APPLE)
+  SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-osx-x86_64-1.10.0.tgz")
+else ()
+  SET(ONNXRUNTIME_URL "https://github.com/microsoft/onnxruntime/releases/download/v1.10.0/onnxruntime-linux-x64-1.10.0.tgz")
+endif()
+
+
+INCLUDE_DIRECTORIES(${ONNXRUNTIME_INC_DIR}) # For ONNXRUNTIME code to include internal headers.
+if (WIN32)
+  SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
+  SET(ONNXRUNTIME_SHARED_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.dll" CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
+  SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/onnxruntime.lib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
+elseif (APPLE)
+  SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
+  SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.1.10.0.dylib" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
+  SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
+else ()
+  SET(ONNXRUNTIME_SOURCE_LIB "${ONNXRUNTIME_SOURCE_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME source library." FORCE)
+  SET(ONNXRUNTIME_LIB "${ONNXRUNTIME_INSTALL_DIR}/lib/libonnxruntime.so.1.10.0" CACHE FILEPATH "ONNXRUNTIME static library." FORCE)
+  SET(ONNXRUNTIME_SHARED_LIB ${ONNXRUNTIME_LIB} CACHE FILEPATH "ONNXRUNTIME shared library." FORCE)
+endif ()
+
+if (WIN32)
+  ExternalProject_Add(
+    ${ONNXRUNTIME_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    URL ${ONNXRUNTIME_URL}
+    PREFIX ${ONNXRUNTIME_PREFIX_DIR}
+    DOWNLOAD_NO_PROGRESS 1
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    UPDATE_COMMAND ""
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_SHARED_LIB} &&
+                    ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_DIR}/lib/onnxruntime.lib ${ONNXRUNTIME_LIB} &&
+                    ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR}
+    BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB}
+  )
+else ()
+  ExternalProject_Add(
+    ${ONNXRUNTIME_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    URL ${ONNXRUNTIME_URL}
+    PREFIX ${ONNXRUNTIME_PREFIX_DIR}
+    DOWNLOAD_NO_PROGRESS 1
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    UPDATE_COMMAND ""
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${ONNXRUNTIME_SOURCE_LIB} ${ONNXRUNTIME_LIB} &&
+                    ${CMAKE_COMMAND} -E copy_directory ${ONNXRUNTIME_SOURCE_DIR}/include ${ONNXRUNTIME_INC_DIR}
+    BUILD_BYPRODUCTS ${ONNXRUNTIME_LIB}
+  )
+endif()
+
+ADD_LIBRARY(onnxruntime STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET onnxruntime PROPERTY IMPORTED_LOCATION ${ONNXRUNTIME_LIB})
+ADD_DEPENDENCIES(onnxruntime ${ONNXRUNTIME_PROJECT})
diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..661c3675c84b27a7ed8210fec0cfeaa2c858487c
--- /dev/null
+++ b/cmake/external/paddle2onnx.cmake
@@ -0,0 +1,96 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT WITH_ONNXRUNTIME)
+  return()
+endif()
+
+if (WITH_ARM)
+  message(SEND_ERROR "The current onnxruntime backend doesn't support ARM cpu")
+  return()
+endif ()
+
+INCLUDE(ExternalProject)
+
+SET(PADDLE2ONNX_PROJECT "extern_paddle2onnx")
+SET(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx)
+SET(PADDLE2ONNX_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle2onnx)
+SET(PADDLE2ONNX_INC_DIR "${PADDLE2ONNX_INSTALL_DIR}/include" CACHE PATH "paddle2onnx include directory." FORCE)
+SET(PADDLE2ONNX_REPOSITORY ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git)
+SET(PADDLE2ONNX_TAG cpp)
+SET(LIBDIR "lib")
+SET(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}")
+
+INCLUDE_DIRECTORIES(${PADDLE2ONNX_INC_DIR}) # For PADDLE2ONNX code to include internal headers.
+if(WIN32)
+  SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib" CACHE FILEPATH "paddle2onnx static library." FORCE)
+  SET(PADDLE2ONNX_SHARED_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll" CACHE FILEPATH "paddle2onnx shared library." FORCE)
+elseif(APPLE)
+  SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib" CACHE FILEPATH "PADDLE2ONNX library." FORCE)
+else()
+  SET(PADDLE2ONNX_LIB "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so" CACHE FILEPATH "PADDLE2ONNX library." FORCE)
+endif(WIN32)
+
+
+# The protoc path is required to compile onnx.
+string(REPLACE "/" ";" PROTOC_BIN_PATH ${PROTOBUF_PROTOC_EXECUTABLE})
+list(POP_BACK PROTOC_BIN_PATH)
+list(JOIN PROTOC_BIN_PATH "/" PROTOC_BIN_PATH)
+
+
+set(PADDLE2ONNX_OPTIONAL_ARGS
+  -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+  -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+  -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+  -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+  -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+  -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+  -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+  -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+  -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH}
+  -DWITH_STATIC=OFF
+  -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR}
+  -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+  -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+  ${EXTERNAL_OPTIONAL_ARGS}
+)
+
+if (WITH_PYTHON)
+  set(PADDLE2ONNX_OPTIONAL_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS}
+    -DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE}
+    -DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR}
+    -DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY}
+  )
+endif ()
+
+
+ExternalProject_Add(
+  ${PADDLE2ONNX_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  ${SHALLOW_CLONE}
+  GIT_REPOSITORY ${PADDLE2ONNX_REPOSITORY}
+  GIT_TAG ${PADDLE2ONNX_TAG}
+  DEPENDS protobuf
+  PREFIX ${PADDLE2ONNX_PREFIX_DIR}
+  UPDATE_COMMAND ""
+  CMAKE_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS}
+  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR}
+                   -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                   -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+  BUILD_BYPRODUCTS ${PADDLE2ONNX_LIB}
+)
+
+ADD_LIBRARY(paddle2onnx STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB})
+ADD_DEPENDENCIES(paddle2onnx ${PADDLE2ONNX_PROJECT})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index f7cb7716969f5ccaa97d1ad7964510376b86870a..58ff5f0d2b715d117018eb2ff3d5989c8beb0694 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -198,7 +198,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
             "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}")
     ENDIF()
-    if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
+
+    if(WITH_ONNXRUNTIME)
+        SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git)
+        SET(PROTOBUF_TAG v3.18.0)
+    elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11)
         SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git)
         SET(PROTOBUF_TAG v3.8.0)
     elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11)
@@ -248,7 +252,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
 )
 ENDFUNCTION()
 
-if(WITH_ASCEND OR WITH_ASCEND_CL)
+if(WITH_ONNXRUNTIME)
+  SET(PROTOBUF_VERSION 3.18.0)
+elseif(WITH_ASCEND OR WITH_ASCEND_CL)
   SET(PROTOBUF_VERSION 3.8.0)
 elseif(WITH_IPU)
   SET(PROTOBUF_VERSION 3.6.1)
diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index 45a76fdc1f1a2aab66e7f4972eecbbec03af941a..cfbe68eecbaca55c5a288aae2c985bbc33d37be2 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()
 
 if(NOT DEFINED XPU_BASE_URL)
   SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
-  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228")
+  SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220307")
 else()
   SET(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index c48d31f7e4f90296ecc48acb56e619aae129106e..851bd81403a85e52fbbb3c4c8bf0da1df63c8848 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -114,6 +114,24 @@ function(copy_part_of_thrid_party TARGET DST)
         endif()
     endif()
 
+    if (WITH_ONNXRUNTIME)
+        set(dst_dir "${DST}/third_party/install/onnxruntime")
+        copy(${TARGET}
+                SRCS ${ONNXRUNTIME_INC_DIR} ${ONNXRUNTIME_LIB_DIR}
+                DSTS ${dst_dir} ${dst_dir})
+
+        set(dst_dir "${DST}/third_party/install/paddle2onnx")
+        if(WIN32)
+            copy(${TARGET}
+                    SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB} ${PADDLE2ONNX_LIB}
+                    DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib)
+        else()
+            copy(${TARGET}
+                    SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB}
+                    DSTS ${dst_dir}/include ${dst_dir}/lib)
+        endif()
+    endif()
+
     set(dst_dir "${DST}/third_party/install/gflags")
     copy(${TARGET}
             SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 9e8c81c2985b702fb8bf608d5a2d4c3a7d630564..1291e60cfe4ce13ca9aeeb3f8bdf068af0d5832c 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -478,7 +478,7 @@ function(op_library TARGET)
     if (${pybind_flag} EQUAL 0)
       # NOTE(*): activation use macro to regist the kernels, set use_op manually.
       if(${TARGET} STREQUAL "activation")
-        file(APPEND ${pybind_file} "USE_OP(relu);\n")
+        file(APPEND ${pybind_file} "USE_OP_ITSELF(relu);\n")
       elseif(${TARGET} STREQUAL "fake_dequantize")
        file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
      elseif(${TARGET} STREQUAL "fake_quantize")
diff --git a/cmake/phi.cmake b/cmake/phi.cmake
index f6e15758379ada165a9dc0e31273a533b06ad2df..ebb686d8ad0f31917e64161d6f7d2ecd4644fadd 100644
--- a/cmake/phi.cmake
+++ b/cmake/phi.cmake
@@ -134,8 +134,8 @@ function(kernel_library TARGET)
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
             list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu)
         endif()
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
-            list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu)
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu)
+            list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}.cu)
         endif()
     endif()
     if (WITH_XPU)
@@ -197,92 +197,88 @@ function(kernel_library TARGET)
     # kernel source file level
     # level 1: base device kernel
-    #     - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs
+    #     - cpu_srcs / gpu_srcs / xpu_srcs / gpudnn_srcs / kps_srcs
     # level 2: device-independent kernel
     #     - common_srcs
     # level 3: Kernel implemented by reusing device-independent kernel
     #     - selected_rows_srcs
+    set(base_device_kernels)
+    set(device_independent_kernel)
+    set(high_level_kernels)
 
-    # Build Target according different src organization
-    if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
-        ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND
-        (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0))
-        # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule.
+    # 1. Base device kernel compile
+    if (${cpu_srcs_len} GREATER 0)
+        cc_library(${TARGET}_cpu SRCS ${cpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+        list(APPEND base_device_kernels ${TARGET}_cpu)
+    endif()
+    if (${gpu_srcs_len} GREATER 0)
         if (WITH_GPU)
-            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
-                nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
-            endif()
+            nv_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         elseif (WITH_ROCM)
-            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
-                hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
-            endif()
-        elseif (WITH_XPU_KP)
-            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
-                xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
-            endif()
-        else()
-            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
-                cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-                cc_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part)
-            endif()
+            hip_library(${TARGET}_gpu SRCS ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         endif()
-    # If there are only specific device srcs, build target using this rule.
-    elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
+        list(APPEND base_device_kernels ${TARGET}_gpu)
+    endif()
+    if (${xpu_srcs_len} GREATER 0)
+        cc_library(${TARGET}_xpu SRCS ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+        list(APPEND base_device_kernels ${TARGET}_xpu)
+    endif()
+    if (${gpudnn_srcs_len} GREATER 0)
         if (WITH_GPU)
-            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
-                nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            endif()
+            nv_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         elseif (WITH_ROCM)
-            if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0)
-                hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            endif()
-        elseif (WITH_XPU_KP)
-            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0)
-                xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            endif()
-        else()
-            if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
-                cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            endif()
+            hip_library(${TARGET}_gpudnn SRCS ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
         endif()
-    # If the selected_rows_srcs depends on common_srcs, build target using this rule.
-    elseif (${common_srcs_len} GREATER 0 AND ${selected_rows_srcs_len} GREATER 0)
+        list(APPEND base_device_kernels ${TARGET}_gpudnn)
+    endif()
+    if (${kps_srcs_len} GREATER 0)
+        # only when WITH_XPU_KP, the kps_srcs_len can be > 0
+        xpu_library(${TARGET}_kps SRCS ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+        list(APPEND base_device_kernels ${TARGET}_kps)
+    endif()
+
+    # 2. Device-independent kernel compile
+    if (${common_srcs_len} GREATER 0)
         if (WITH_GPU)
-            nv_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
+            nv_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
         elseif (WITH_ROCM)
-            hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
+            hip_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
         elseif (WITH_XPU_KP)
-            xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
+            xpu_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
         else()
-            cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-            cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part)
+            cc_library(${TARGET}_common SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels})
         endif()
-    # If there are only common_srcs or selected_rows_srcs, build target using below rules.
-    elseif (${common_srcs_len} GREATER 0)
+        list(APPEND device_independent_kernel ${TARGET}_common)
+    endif()
+
+    # 3. Reusing kernel compile
+    if (${selected_rows_srcs_len} GREATER 0)
         if (WITH_GPU)
-            nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            nv_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
         elseif (WITH_ROCM)
-            hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            hip_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
         elseif (WITH_XPU_KP)
-            xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            xpu_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
         else()
-            cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            cc_library(${TARGET}_sr SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel})
         endif()
-    elseif (${selected_rows_srcs_len} GREATER 0)
+        list(APPEND high_level_kernels ${TARGET}_sr)
+    endif()
+
+    # 4. Unify target compile
+    list(LENGTH base_device_kernels base_device_kernels_len)
+    list(LENGTH device_independent_kernel device_independent_kernel_len)
+    list(LENGTH high_level_kernels high_level_kernels_len)
+    if (${base_device_kernels_len} GREATER 0 OR ${device_independent_kernel_len} GREATER 0 OR
+        ${high_level_kernels_len} GREATER 0)
         if (WITH_GPU)
-            nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            nv_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
         elseif (WITH_ROCM)
-            hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            hip_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
         elseif (WITH_XPU_KP)
-            xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            xpu_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
         else()
-            cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+            cc_library(${TARGET} DEPS ${kernel_library_DEPS} ${kernel_deps} ${base_device_kernels} ${device_independent_kernel} ${high_level_kernels})
         endif()
     else()
         set(target_build_flag 0)
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
index ac3eff04d5383ecdf6c771babcaf3e6811600ac3..7df095c6c2ec04e1a694ed2458787af285c96a9a 100644
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -250,6 +250,12 @@ IF(WITH_TESTING OR WITH_DISTRIBUTE)
     list(APPEND third_party_deps extern_gtest)
 ENDIF()
 
+if(WITH_ONNXRUNTIME)
+    include(external/onnxruntime)    # download, build, install onnxruntime, paddle2onnx
+    include(external/paddle2onnx)
+    list(APPEND third_party_deps extern_onnxruntime extern_paddle2onnx)
+endif()
+
 if(WITH_GPU)
     if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
         include(external/cub)          # download cub
diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
index 96bc4a710f8c1c3c38b049368b204daad5dcd3f2..f88c993d85e2fa6eda27b7e845ee27f08347fa83 100644
--- a/paddle/fluid/distributed/collective/CMakeLists.txt
+++ b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -7,3 +7,6 @@ cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup)
 if(WITH_NCCL)
     cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api)
 endif()
+if(WITH_ASCEND_CL)
+    cc_library(processgroup_hccl SRCS ProcessGroupHCCL.cc DEPS place npu_stream enforce collective_helper device_context phi phi_api eager_api)
+endif()
diff --git a/paddle/fluid/distributed/collective/HCCLTools.h b/paddle/fluid/distributed/collective/HCCLTools.h
new file mode 100644
index 0000000000000000000000000000000000000000..09789bd4d378630f548f931bcac00fda89ef33be
--- /dev/null
+++ b/paddle/fluid/distributed/collective/HCCLTools.h
@@ -0,0 +1,174 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <error.h>
+#include <string>
+
+#include "boost/variant.hpp"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/npu/enforce_npu.h"
+#include "paddle/fluid/platform/device/npu/npu_info.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace distributed {
+
+class NPUEventManager {
+ public:
+  NPUEventManager() = default;
+
+  ~NPUEventManager() {
+    if (is_created_) {
+      platform::NPUDeviceGuard guard(device_index_);
+      platform::NPUEventDestroy(event_);
+    }
+  }
+
+  NPUEventManager(const NPUEventManager&) = delete;
+  NPUEventManager& operator=(const NPUEventManager&) = delete;
+
+  NPUEventManager(NPUEventManager&& other) {
+    std::swap(is_created_, other.is_created_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+  }
+
+  NPUEventManager& operator=(NPUEventManager&& other) {
+    std::swap(is_created_, other.is_created_);
+    std::swap(device_index_, other.device_index_);
+    std::swap(event_, other.event_);
+    return *this;
+  }
+
+  bool IsCreated() const { return is_created_; }
+  bool DeviceId() const { return device_index_; }
+  aclrtEvent GetRawNPUEvent() const { return event_; }
+
+  void Record(const paddle::platform::NPUDeviceContext& ctx) {
+    auto device_index = ctx.GetPlace().device;
+    if (!is_created_) {
+      CreateEvent(device_index);
+    }
+    PADDLE_ENFORCE_EQ(device_index, device_index_,
+                      platform::errors::PreconditionNotMet(
+                          "NPUDeviceContext's device %d does not match"
+                          "Event's device %d",
+                          device_index, device_index_));
+
+    platform::NPUDeviceGuard guard(device_index_);
+    platform::NPUEventRecord(event_, ctx.stream());
+  }
+
+  bool Query() const {
+    aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
+    platform::NPUEventQuery(event_, &status);
+    if (status == ACL_EVENT_STATUS_COMPLETE) {
+      return true;
+    }
+    return false;
+  }
+
+  void Block(const paddle::platform::NPUDeviceContext& ctx) const {
+    if (is_created_) {
+      auto device_index = ctx.GetPlace().device;
+      PADDLE_ENFORCE_EQ(device_index, device_index_,
+                        platform::errors::PreconditionNotMet(
+                            "CUDADeviceContext's device %d does not match"
+                            "Event's device %d",
+                            device_index, device_index_));
+      platform::NPUDeviceGuard guard(device_index_);
+      platform::NPUStreamWaitEvent(ctx.stream(), event_);
+    }
+  }
+
+ private:
+  bool is_created_{false};
+  aclrtEvent event_{};
+  int8_t device_index_{0};
+
+ private:
+  void CreateEvent(int device_index) {
+    device_index_ = device_index;
+    platform::NPUDeviceGuard guard(device_index);
+    platform::NPUEventCreate(&event_);
+    is_created_ = true;
+  }
+};
+
+class HCCLCommManager {
+ public:
+  explicit HCCLCommManager(HcclComm hcclComm) : hccl_comm_(hcclComm) {}
+
+  HCCLCommManager() : HCCLCommManager(nullptr) {}
+
+  ~HCCLCommManager() noexcept {
+    std::unique_lock<std::mutex> lock(mutex_);
+    if (hccl_comm_) {
+      platform::dynload::HcclCommDestroy(hccl_comm_);
+    }
+  }
+
+  static std::shared_ptr<HCCLCommManager> Create(int num_ranks, int rank,
+                                                 HcclRootInfo* comm_id,
+                                                 HcclComm hccl_comm) {
+    auto hccl_manager = std::make_shared<HCCLCommManager>();
+    auto ret = platform::dynload::HcclCommInitRootInfo(num_ranks, comm_id, rank,
+                                                       &hccl_comm);
+    using __NPU_STATUS_TYPE__ = decltype(ret);
+    constexpr auto __success_type__ =
+        platform::details::NPUStatusType<__NPU_STATUS_TYPE__>::kSuccess;
+    if (UNLIKELY(ret != __success_type__)) {
+      VLOG(0) << "Error: create hccl_id error.";
+      exit(-1);
+    }
+
+    hccl_manager->hccl_id_ = comm_id;
+    hccl_manager->rank_ = rank;
+    hccl_manager->hccl_comm_ = hccl_comm;
+    return hccl_manager;
+  }
+
+  HcclRootInfo* GetHcclId() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return hccl_id_;
+  }
+
+  HcclComm GetHcclComm() const {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return hccl_comm_;
+  }
+
+  HCCLCommManager(const HCCLCommManager&) = delete;
+  HCCLCommManager& operator=(const HCCLCommManager&) = delete;
+  HCCLCommManager& operator=(HCCLCommManager&& other) = delete;
+
+  HCCLCommManager(HCCLCommManager&& other) {
+    std::unique_lock<std::mutex> lock(other.mutex_);
+    std::swap(hccl_comm_, other.hccl_comm_);
+  }
+
+ protected:
+  HcclComm hccl_comm_;
+  HcclRootInfo* hccl_id_;
+  int rank_;
+  mutable std::mutex mutex_;
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
new file mode 100644
index 0000000000000000000000000000000000000000..84f5ca48d25c84b3ba29dbff43952fbf08b22cb9
--- /dev/null
+++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.cc
@@ -0,0 +1,356 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/collective/ProcessGroupHCCL.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/device/npu/hccl_helper.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/common/place.h"
+
+DECLARE_bool(hccl_blocking_wait);
+// DECLARE_bool(use_stream_safe_npu_allocator);
+
+constexpr int64_t kWaitBlockTImeout = 10;
+
+namespace paddle {
+namespace distributed {
+
+static HcclReduceOp ToHCCLRedType(ReduceOp reduction) {
+  static const std::map<ReduceOp, HcclReduceOp> red_type = {
+      {ReduceOp::MIN, HCCL_REDUCE_MIN},
+      {ReduceOp::MAX, HCCL_REDUCE_MAX},
+      {ReduceOp::SUM, HCCL_REDUCE_SUM},
+      {ReduceOp::PRODUCT, HCCL_REDUCE_PROD},
+  };
+  auto it = red_type.find(reduction);
+  PADDLE_ENFORCE_EQ(
+      it != red_type.end(), true,
+      platform::errors::InvalidArgument("Invalid hccl reduction. "
+                                        "Must be Min | Max | Prod | Sum"));
+  return it->second;
+}
+
+std::string SerializeHCCLUniqueId(const HcclRootInfo& hcclID) {
+  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&hcclID);
+  std::ostringstream oss;
+  for (size_t i = 0; i < sizeof(hcclID); ++i) {
+    oss << std::hex << static_cast<int>(bytes[i]);
+  }
+  return oss.str();
+}
+
+// Get the list of devices from list of tensors
+std::vector<Place> GetPlaceList(const std::vector<Tensor>& tensors) {
+  std::vector<Place> places;
+  places.reserve(tensors.size());
+  for (auto& tensor : tensors) {
+    places.push_back(tensor.inner_place());
+  }
+  return places;
+}
+
+// Get the deviceList String from the list of devices
+std::string GetKeyFromPlaces(const std::vector<Place>& places) {
+  std::string placeList;
+  for (auto& place : places) {
+    std::stringstream tmp;
+    tmp << place;
+    if (placeList.empty()) {
+      placeList += tmp.str();
+    } else {
+      placeList += "," + tmp.str();
+    }
+  }
+  return placeList;
+}
+
+// bool CheckTensorsInNPUPlace(const std::vector<Tensor>& tensors) {
+//   return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) {
+//     return t.place() == platform::DeviceType::NPU;
+//   });
+// }
+
+void SyncDefaultStream(
+    const std::vector<Place>& places,
+    std::vector<NPUEventManager>& hcclEvents,                   // NOLINT
+    std::vector<std::unique_ptr<NPUDeviceContext>>& dev_ctx) {  // NOLINT
+  for (size_t i = 0; i < places.size(); ++i) {
+    auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(places[i]));
+    hcclEvents[i].Record(*dev_ctx[i]);
+    hcclEvents[i].Block(*default_ctx);
+  }
+}
+
+std::shared_ptr<ProcessGroupHCCL::HCCLTask> ProcessGroupHCCL::CreateTask(
+    std::vector<Place> places, int rank, CommType comm_type,
+    const std::vector<Tensor>& inputs) {
+  return std::make_shared<ProcessGroupHCCL::HCCLTask>(places, rank, comm_type,
+                                                      inputs);
+}
+
+ProcessGroupHCCL::HCCLTask::HCCLTask(const std::vector<Place>& places, int rank,
+                                     CommType CommType,
+                                     const std::vector<Tensor>& inputs)
+    : Task(rank, inputs, CommType), places_(places) {
+  control_events_.resize(places.size());
+  hcclComms_.resize(places.size());
+}
+
+ProcessGroupHCCL::HCCLTask::~HCCLTask() {}
+
+void ProcessGroupHCCL::HCCLTask::SetOutputs(
+    std::vector<Tensor>& outputs) {  // NOLINT
+  outputs_ = std::make_shared<std::vector<Tensor>>(outputs);
+}
+
+void ProcessGroupHCCL::HCCLTask::SynchronizeStreams() {
+  for (size_t i = 0; i < places_.size(); ++i) {
+    auto* default_ctx = static_cast<platform::NPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(places_[i]));
+    platform::NPUStreamWaitEvent(default_ctx->stream(),
+                                 control_events_[i].GetRawNPUEvent());
+  }
+}
+
+bool ProcessGroupHCCL::HCCLTask::IsCompleted() {
+  for (size_t i = 0; i < places_.size(); ++i) {
+    if (!control_events_[i].Query()) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// TODO(sandyhouse): Add timeout for wait, now timeout unused
+bool ProcessGroupHCCL::HCCLTask::Wait(std::chrono::milliseconds timeout) {
+  SynchronizeStreams();
+  if (FLAGS_hccl_blocking_wait) {
+    // NOTE(sandyhouse): It will block host for sync
+    while (!IsCompleted()) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout));
+    }
+  }
+  return true;
+}
+
+// Same as Wait
+void ProcessGroupHCCL::HCCLTask::Synchronize() { Wait(kWaitTimeout); }
+
+ProcessGroupHCCL::ProcessGroupHCCL(const std::shared_ptr<Store>& store,
+                                   int rank, int size)
+    : ProcessGroup(rank, size), store_(store) {}
+
+void ProcessGroupHCCL::BroadcastUniqueHCCLID(
+    std::vector<HcclRootInfo>& hccl_ids) {  // NOLINT
+  if (rank_ == 0) {
+    for (size_t i = 0; i < hccl_ids.size(); i++) {
+      auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
+      auto hccl_id = std::vector<uint8_t>(
+          reinterpret_cast<uint8_t*>(&hccl_ids[i]),
+          reinterpret_cast<uint8_t*>(&hccl_ids[i]) + sizeof(HcclRootInfo));
+      store_->set(key, hccl_id);
+    }
+  } else {
+    for (size_t i = 0; i < hccl_ids.size(); i++) {
+      auto key = "ProcessGroupHCCL/hccl_ids/" + std::to_string(i);
+      auto ret = store_->get(key);
+      std::memcpy(&hccl_ids[i], ret.data(), ret.size());
+    }
+  }
+}
+
+// create HCCLManager cache for places_key
+void ProcessGroupHCCL::CreateHCCLManagerCache(
+    const std::string& places_key, const std::vector<Place>& places) {
+  PADDLE_ENFORCE_EQ(places_key.empty(), false,
+                    platform::errors::PreconditionNotMet(
+                        "Not able to create/get the HCCL Communicator since "
+                        "the NPU place are not known"));
+
+  std::vector<std::shared_ptr<HCCLCommManager>> hccl_comms;
+  hccl_comms.resize(places.size());
+
+  // using vector just for broadcast
+  std::vector<HcclRootInfo> hccl_ids;
+  hccl_ids.resize(1);
+  auto& hccl_id = hccl_ids.front();
+
+  if (rank_ == 0) {
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(&hccl_id));
+  }
+  BroadcastUniqueHCCLID(hccl_ids);
+
+  VLOG(3) << "init hccl rank: " << rank_ << ", nranks: " << size_
+          << ", place: " << places_key
+          << ", hccl uniqueid: " << SerializeHCCLUniqueId(hccl_id);
+
+  std::vector<std::unique_ptr<NPUDeviceContext>> dev_ctx;
+  dev_ctx.resize(places.size());
+
+  std::unique_ptr<HcclComm[]> comms(new HcclComm[places.size()]);
+  for (size_t i = 0; i < places.size(); ++i) {
+    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+    hccl_comms[i] = HCCLCommManager::Create(GetSize(), GetRank(), &hccl_id,
+                                            comms.get() + i);
+    dev_ctx[i].reset(new NPUDeviceContext(places[i]));
+  }
+
+  std::vector<NPUEventManager> events;
+  events.resize(places.size());
+
+  // These caches will be useful to process sync/wait/communicate
+  places_to_events_.emplace(places_key, std::move(events));
+  places_to_hcclcomm_.emplace(places_key, std::move(hccl_comms));
+  places_to_ctx_.emplace(places_key, std::move(dev_ctx));
+}
+
+template <typename Fn>
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Collective(
+    std::vector<Tensor>& inputs, std::vector<Tensor>& outputs, Fn fn,
+    CommType op_type) {
+  const auto places = GetPlaceList(inputs);
+  const auto key = GetKeyFromPlaces(places);
+
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) {
+      CreateHCCLManagerCache(key, places);
+    }
+  }
+
+  auto& hccl_comms = places_to_hcclcomm_[key];
+
+  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
+
+  auto task = CreateTask(places, rank_, op_type, inputs);
+  task->SetOutputs(outputs);
+
+  // if (FLAGS_use_stream_safe_npu_allocator) {
+  //   for (size_t i = 0; i < inputs.size(); ++i) {
+  //     platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+  //     auto dense_tensor =
+  //         std::dynamic_pointer_cast<phi::DenseTensor>(inputs[i].impl());
+  //     memory::RecordStream(dense_tensor->Holder(),
+  //                          places_to_ctx_[key][i]->stream());
+  //   }
+  // }
+
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+    const auto& hccl_stream = places_to_ctx_[key][i]->stream();
+    fn(inputs[i], outputs[i], hccl_comms[i]->GetHcclComm(), hccl_stream);
+  }
+
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+    task->control_events_[i].Record(*places_to_ctx_[key][i]);
+  }
+  return task;
+}
+
+template <typename Fn>
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::PointToPoint(
+    std::vector<Tensor>& tensors, Fn fn, int dst_rank, CommType op_type) {
+  const auto places = GetPlaceList(tensors);
+  const auto key = GetKeyFromPlaces(places);
+
+  {
+    std::lock_guard<std::mutex> lock(mutex_);
+    if (places_to_hcclcomm_.find(key) == places_to_hcclcomm_.end()) {
+      CreateHCCLManagerCache(key, places);
+    }
+  }
+
+  auto& hccl_comms = places_to_hcclcomm_[key];
+
+  SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]);
+
+  auto task = CreateTask(places, rank_, op_type, tensors);
+
+  // construct uninitialize guard for device
+
+  // if (FLAGS_use_stream_safe_npu_allocator) {
+  //   for (size_t i = 0; i < tensors.size(); ++i) {
+  //     platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+  //     auto dense_tensor =
+  //         std::dynamic_pointer_cast<phi::DenseTensor>(tensors[i].impl());
+  //     memory::RecordStream(dense_tensor->Holder(),
+  //                          places_to_ctx_[key][i]->stream());
+  //   }
+  // }
+
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+    const auto& hccl_stream = places_to_ctx_[key][i]->stream();
+    fn(tensors[i], hccl_comms[i]->GetHcclComm(), hccl_stream, dst_rank);
+  }
+
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    platform::NPUDeviceGuard guard(places[i].GetDeviceId());
+    task->control_events_[i].Record(*places_to_ctx_[key][i]);
+  }
+  return task;
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::AllReduce(
+    std::vector<Tensor>& tensors, const AllreduceOptions& opts) {
+  // PADDLE_ENFORCE_EQ(
+  //     CheckTensorsInNPUPlace(tensors), true,
+  //     platform::errors::InvalidArgument("All inputs should be in
+  //     NPUPlace."));
+  return Collective(
+      tensors, tensors,
+      [&](const Tensor& input, Tensor& output, HcclComm comm,
+          const aclrtStream& stream) {
+        auto input_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
+        auto output_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
+        return platform::dynload::HcclAllReduce(
+            input_tensor->data(), output_tensor->data(), input_tensor->numel(),
+            platform::ToHCCLDataType(input.type()),
+            ToHCCLRedType(opts.reduce_op), comm, stream);
+      },
+      CommType::ALLREDUCE);
+}
+
+std::shared_ptr<ProcessGroup::Task> ProcessGroupHCCL::Broadcast(
+    std::vector<Tensor>& tensors, const BroadcastOptions& opts) {
+  // PADDLE_ENFORCE_EQ(
+  //     CheckTensorsInNPUPlace(tensors), true,
+  //     platform::errors::InvalidArgument("All inputs should be in
+  //     CudaPlace."));
+
+  return Collective(
+      tensors, tensors,
+      [&](Tensor& input, Tensor& output, HcclComm comm,
+          const aclrtStream& stream) {
+        const auto root = opts.source_rank * tensors.size() + opts.source_root;
+        auto input_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(input.impl());
+        auto output_tensor =
+            std::dynamic_pointer_cast<phi::DenseTensor>(output.impl());
+        return platform::dynload::HcclBroadcast(
+            input_tensor->data(), input_tensor->numel(),
+            platform::ToHCCLDataType(input.type()), root, comm, stream);
+      },
+      CommType::BROADCAST);
+}
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/collective/ProcessGroupHCCL.h b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2376b4eed7600f67d6e4564b44920cbe3936f76
--- /dev/null
+++ b/paddle/fluid/distributed/collective/ProcessGroupHCCL.h
@@ -0,0 +1,152 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <chrono>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/platform/device/npu/npu_stream.h"
+#include "paddle/fluid/platform/device_context.h"
+
+#include "paddle/fluid/distributed/collective/HCCLTools.h"
+#include "paddle/fluid/distributed/store/store.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gen_comm_id_helper.h"
+#include "paddle/fluid/platform/place.h"
+
+constexpr const char* HCCL_BACKEND_NAME = "HCCL";
+
+namespace paddle {
+namespace distributed {
+
+using Place = paddle::platform::Place;
+using NPUStream = platform::stream::NPUStream;
+using NPUDeviceContext = paddle::platform::NPUDeviceContext;
+
+class ProcessGroupHCCL : public ProcessGroup {
+ public:
+  class HCCLTask : public ProcessGroup::Task,
+                   public std::enable_shared_from_this<HCCLTask> {
+   public:
+    HCCLTask(const std::vector<Place>& places, int rank, CommType CommType,
+             const std::vector<Tensor>& inputs);
+
+    bool IsCompleted();
+
+    void SynchronizeStreams();
+
+    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);
+
+    void Synchronize();
+
+    void SetOutputs(std::vector<Tensor>& outputs);  // NOLINT
+
+    virtual ~HCCLTask();
+
+    std::vector<NPUEventManager> control_events_;
+
+   protected:
+    std::vector<Place> places_;
+    std::vector<std::shared_ptr<HCCLCommManager>> hcclComms_;
+    std::shared_ptr<std::vector<Tensor>> outputs_;
+
+   private:
+  };
+
+  ProcessGroupHCCL(const std::shared_ptr<Store>& store, int rank, int size);
+
+  const std::string GetBackendName() const override {
+    return std::string(HCCL_BACKEND_NAME);
+  }
+
+  std::shared_ptr<ProcessGroup::Task> AllReduce(
+      std::vector<Tensor>& tensors,
+      const AllreduceOptions& = AllreduceOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Task> Broadcast(
+      std::vector<Tensor>& tensors,
+      const BroadcastOptions& = BroadcastOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Task> Barrier(
+      const BarrierOptions& = BarrierOptions()) override;
+
+  std::shared_ptr<ProcessGroup::Task> Send(std::vector<Tensor>& tensors,
+                                           int dst_rank) override;
+
+  std::shared_ptr<ProcessGroup::Task> Recv(std::vector<Tensor>& tensors,
+                                           int src_rank) override;
+
+  std::shared_ptr<ProcessGroup::Task> AllGather(
+      std::vector<Tensor>& in_tensors,
+      std::vector<Tensor>& out_tensors) override;
+
+  std::shared_ptr<ProcessGroup::Task> AllToAll(
+      std::vector<Tensor>& in, std::vector<Tensor>& out) override;
+
+  std::shared_ptr<ProcessGroup::Task> Reduce(
+      std::vector<Tensor>& tensors, const ReduceOptions& opts) override;
+
+  std::shared_ptr<ProcessGroup::Task> Scatter(std::vector<Tensor>& in_tensors,
+                                              std::vector<Tensor>& out_tensors,
+                                              const ScatterOptions&) override;
+
+ protected:
+  virtual std::shared_ptr<ProcessGroupHCCL::HCCLTask> CreateTask(
+      std::vector<Place> places, int rank, CommType opType,
+      const std::vector<Tensor>& inputs);
+
+  std::shared_ptr<Store> store_;
+  std::shared_ptr<HCCLCommManager> hccl_comm_;
+  std::mutex mutex_;
+  std::unordered_map<std::string, std::vector<std::shared_ptr<HCCLCommManager>>>
+      places_to_hcclcomm_;
+
+  std::unordered_map<std::string, std::vector<NPUEventManager>>
+      places_to_events_;
+
+  std::unordered_map<std::string, std::vector<std::unique_ptr<NPUDeviceContext>>>
+      places_to_ctx_;
+
+  std::set<int> used_place_ids_;
+
+ private:
+  void BcastHCCLId(std::vector<HcclRootInfo>& hccl_ids, int root,  // NOLINT
+                   int server_fd);
+
+  void BroadcastUniqueHCCLID(std::vector<HcclRootInfo>& hccl_ids);  // NOLINT
+
+  template <typename Fn>
+  std::shared_ptr<ProcessGroup::Task> Collective(
+      std::vector<Tensor>& inputs,   // NOLINT
+      std::vector<Tensor>& outputs,  // NOLINT
+      Fn fn, CommType op_type);
+
+  template <typename Fn>
+  std::shared_ptr<ProcessGroup::Task> PointToPoint(
+      std::vector<Tensor>& tensors,  // NOLINT
+      Fn fn, int dst_rank, CommType op_type);
+
+  void CreateHCCLManagerCache(const std::string& places_key,
+                              const std::vector<Place>& places);
+};
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc
index 18920d06f38543cc3f7aeb045e7c3058143e006e..ba039385a74ba45aa1f33ba38138d8e5213f2e00 100644
--- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc
+++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc
@@ -24,10 +24,14 @@ limitations under the License. */
 #include "paddle/fluid/distributed/fleet_executor/task_node.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(fill_constant);
 
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+
 namespace paddle {
 namespace distributed {
diff --git a/paddle/fluid/distributed/store/tcp_store.cc b/paddle/fluid/distributed/store/tcp_store.cc
index eb98c89c99e47f6a36c272b7fffb69d65ddf2f0a..b0d5add49565ffb19762778ddd44a388b140c0ee 100644
--- a/paddle/fluid/distributed/store/tcp_store.cc
+++ b/paddle/fluid/distributed/store/tcp_store.cc
@@ -136,10 +136,6 @@ void MasterDaemon::run() {
     }
 
     for (size_t i = 1; i < fds.size(); i++) {
-      VLOG(0) << "fds.size:" << fds.size();
-      VLOG(0) << "fds.size-i:" << i;
-      VLOG(0) << "fds[i].revents:" << fds[i].revents;
-
       try {
         if (fds[i].revents == 0) {
           continue;
diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt
index 8cb69caf66369655ce751163420b3fcec80dd833..698a698fc6d18492faac771e6e0e079a35953504 100644
--- a/paddle/fluid/eager/CMakeLists.txt
+++ b/paddle/fluid/eager/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(eager_deps phi phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node)
+set(eager_deps phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node)
 set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy)
 set(generated_deps dygraph_function dygraph_node)
@@ -10,11 +10,11 @@ endif()
 add_subdirectory(api)
 add_subdirectory(accumulation)
 
-cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi phi_api)
+cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi_api phi_tensor)
 cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator)
-cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi phi_api)
-cc_library(utils SRCS utils.cc DEPS phi phi_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils)
+cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi_api phi_tensor)
+cc_library(utils SRCS utils.cc DEPS phi_api phi_tensor global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils)
 cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info)
 
 add_subdirectory(tests)
diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
index 2fc846cccc22e8937f8865a5063c77321941582a..dc79a8a45a246798551a0bcce8c487f67183220b 100644
--- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc
+++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -47,6 +47,9 @@ std::unordered_map<std::string, std::vector<std::string>>
 static std::unordered_map<std::string, paddle::framework::AttributeMap>
     operators_with_attrs = {};
 
+/* --- Black Ops list that's NO NEED to apply code generation --- */
+static std::unordered_set<std::string> black_ops_list = {"run_program"};
+
 static std::string LegalizeVariableName(const std::string& var_name) {
   std::string ret = var_name;
   std::replace(ret.begin(), ret.end(), '-', '_');  // replace all '-' to '_'
@@ -73,12 +76,6 @@ static bool IgnoreGradAttribute(const std::string& op_type,
 }
 
 static void PrepareAttrMapForOps() {
-  // Handle "run_program_op"
-  static framework::ProgramDesc fake_prog;
-  operators_with_attrs["run_program"] = {};
-  operators_with_attrs["run_program"]["global_block"] =
-      fake_prog.MutableBlock(0);
-
   // Handle "fused_elemwise_add_activation"
   std::vector<std::string> functor_list = {"a", "b"};
   operators_with_attrs["fused_elemwise_add_activation"] = {};
@@ -2349,6 +2346,9 @@ static void DygraphCodeGeneration(const std::string& output_dir) {
     if (!CheckOpProto(op_proto)) continue;
 
     const std::string& op_type = op_proto->type();
+    if (black_ops_list.count(op_type)) {
+      continue;
+    }
 
     /* ----------------------------- */
     /* ---- Collect Information ---- */
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
index c6bca01205e19c58d5924f4e9d60bb76164fee2b..53af6c1048d2454b1e9f375b837103930026ae54 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml")
-set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml")
+set(api_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_api.yaml")
+set(backward_yaml_path "${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml,${PADDLE_SOURCE_DIR}/python/paddle/utils/code_gen/sparse_bw_api.yaml")
 set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc")
 set(tmp_forwards_h_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.h")
 set(tmp_nodes_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/backwards/tmp_nodes.cc")
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
index d1e208541537c8eddf69862aaabbf5fcf6d2a009..967891fe5227dcd6129c0ef1808fba7720711568 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
@@ -23,12 +23,13 @@ core_ops_returns_info = {}
 core_ops_args_info = {}
 core_ops_args_type_info = {}
 
+namespace = ""
 
 yaml_types_mapping = {
-    'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t',  'size_t' : 'size_t', \
+    'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t',  'size_t' : 'size_t', \
     'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
     'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
-    'int64_t[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
+    'int64[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
     'Tensor' : 'Tensor',
     'Tensor[]' : 'std::vector<Tensor>',
     'Tensor[Tensor[]]' : 'std::vector<std::vector<Tensor>>',
@@ -125,6 +126,7 @@ def GetAutoGradMetaVectorName(string):
 def ReadFwdFile(filepath):
     f = open(filepath, 'r')
     contents = yaml.load(f, Loader=yaml.FullLoader)
+    f.close()
     return contents
 
@@ -133,9 +135,13 @@ def ReadBwdFile(filepath):
     contents = yaml.load(f, Loader=yaml.FullLoader)
     ret = {}
     for content in contents:
-        assert 'backward_api' in content.keys()
-        api_name = content['backward_api']
+        if 'backward_api' in content.keys():
+            api_name = content['backward_api']
+        else:
+            assert False
+
         ret[api_name] = content
+    f.close()
     return ret
 
@@ -608,16 +614,23 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
     returns_str += f"return returns;\n"
 
     grad_node_name = GetGradNodeName(fwd_api_name)
+
+    if len(namespace) > 0:
+        grad_api_namespace = f"paddle::experimental::{namespace}"
+    else:
+        grad_api_namespace = f"paddle::experimental"
+
     FUNCTION_TEMPLATE = """
 std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {{
     // Call grad_api function
-    auto grad_api_returns = paddle::experimental::{}({});
+    auto grad_api_returns = {}::{}({});
     {}
 }}
 """
     node_definition_str = FUNCTION_TEMPLATE.format(
-        grad_node_name, bwd_api_name, grad_api_args_str, returns_str)
+        grad_node_name, grad_api_namespace, bwd_api_name, grad_api_args_str,
+        returns_str)
 
     return node_definition_str
@@ -671,7 +684,7 @@ def GenerateNodeCreationCodes(
         else:  # Tuple api_result
             if IsPlainTensorType(rtype):
-                outputs_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);"
+                output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);"
             else:
                 assert IsVectorTensorType(rtype)
                 output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result[{pos}]);\n"
@@ -699,18 +712,24 @@
     # SetTensorWrappers
     set_tensor_wrappers_list = []
-    for name, (_, is_fwd_input, _) in backward_fwd_input_map.items():
+    for name, (atype, is_fwd_input, pos) in backward_fwd_input_map.items():
         is_optional = (name in optional_inputs)
+
         if is_fwd_input:
             if is_optional:
                 set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
             else:
                 set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);"
         else:
+            if IsVectorTensorType(atype):
+                tw_name = f"api_result[{pos}]"
+            else:
+                tw_name = f"api_result"
+
             if is_optional:
-                set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, false);"
+                set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);"
             else:
-                set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);"
+                set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);"
         set_tensor_wrappers_list.append(set_tensor_wrappers)
     set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list)
@@ -850,7 +869,11 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
         function_name = fwd_api_name
     else:
         function_name = fwd_api_name + "_intermediate"
-    forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});"
+
+    if len(namespace) > 0:
+        forward_call_str = f"auto api_result = paddle::experimental::{namespace}::{function_name}({inputs_call_args_str});"
+    else:
+        forward_call_str = f"auto api_result = paddle::experimental::{function_name}({inputs_call_args_str});"
 
     # Get return type list & outputs
     num_outputs = len(forward_outputs_position_map.keys()) - len(
@@ -1000,7 +1023,9 @@ def GenerateNodeCCFile(filepath, node_definition_str):
 #include "paddle/fluid/eager/utils.h"
 #include "paddle/fluid/eager/api/utils/global_utils.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
+#include "paddle/fluid/eager/to_static/run_program_op_node.h"
+
+#include "paddle/phi/api/include/sparse_api.h"
 """
     file_contents += node_definition_str
     with open(filepath, 'a') as f:
@@ -1021,11 +1046,12 @@ def GenerateNodeHFile(filepath, node_declaration_str):
 def GenerateForwardCCFile(filepath, forward_definition_str):
     file_contents = """
+#include "paddle/phi/api/lib/dygraph_api.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
 
+#include "paddle/phi/api/include/sparse_api.h"
 #include "paddle/fluid/eager/api/utils/global_utils.h"
-
 """
     file_contents += GenerateCoreOpInfoDefinition()
@@ -1042,6 +1068,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str):
 #include "paddle/phi/api/all.h"
 #include "paddle/fluid/eager/utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/eager/to_static/run_program_op_func.h"
 """
     file_contents += GenerateCoreOpInfoDeclaration()
@@ -1053,134 +1080,184 @@ if __name__ == "__main__":
     args = ParseArguments()
 
-    api_yaml_path = args.api_yaml_path
-    backward_yaml_path = args.backward_yaml_path
-
-    fwd_api_list = ReadFwdFile(api_yaml_path)
-    grad_api_dict = ReadBwdFile(backward_yaml_path)
+    api_yaml_paths = args.api_yaml_path.split(",")
+    backward_yaml_paths = args.backward_yaml_path.split(",")
 
     # Generate per Dygraph API
     node_declaration_str = ""
     node_definition_str = ""
+
     forward_definition_str = ""
     forward_declaration_str = ""
 
-    for fwd_api in fwd_api_list:
-        # We only generate Ops with grad
-        if 'backward' not in fwd_api.keys():
-            continue
-        assert 'api' in fwd_api.keys()
-        assert 'args' in fwd_api.keys()
-        assert 'output' in fwd_api.keys()
-        assert 'backward' in fwd_api.keys()
-
-        no_need_buffer_set = set()
-        if 'no_need_buffer' in fwd_api.keys():
-            no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer'])
-
-        fwd_api_name = fwd_api['api']
-        fwd_args_str = fwd_api['args']
-        fwd_returns_str = fwd_api['output']
-
-        bwd_api_name = fwd_api['backward']
-        assert bwd_api_name in grad_api_dict.keys()
-        bwd_api = grad_api_dict[bwd_api_name]
-
-        assert 'args' in bwd_api.keys()
-        assert 'output' in bwd_api.keys()
-        assert 'forward' in bwd_api.keys()
-
-        # Parse Dispensable Inputs
-        optional_inputs = []
-        if 'optional' in fwd_api.keys():
-            optional_inputs = ParseDispensable(fwd_api['optional'])
-
-        bwd_forward_str = bwd_api['forward']
-        bwd_args_str = bwd_api['args']
-        bwd_returns_str = bwd_api['output']
-
-        # Collect Forward Inputs/Outputs
-        forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward(
-            bwd_forward_str)
-        print("Parsed Forward Inputs List: ", forward_inputs_list)
-        print("Prased Forward Attrs List: ", forward_attrs_list)
-        print("Parsed Forward Returns List: ", forward_returns_list)
-
-        intermediate_outputs = []
-        if 'intermediate' in fwd_api.keys():
-            intermediate_outputs = ParseIntermediate(fwd_api['intermediate'])
-
-        IntermediateValidationCheck(intermediate_outputs, forward_returns_list)
-
-        # Collect Original Forward Inputs/Outputs and then perform validation checks
-        orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward(
-            fwd_args_str, fwd_returns_str)
-        print("Parsed Original Forward Inputs List: ", orig_forward_inputs_list)
-        print("Prased Original Forward Attrs List: ", orig_forward_attrs_list)
-        print("Parsed Original Forward Returns List: ",
-              orig_forward_returns_list)
-
-        # Forward Validation Checks
-        ForwardsValidationCheck(forward_inputs_list, forward_attrs_list,
-                                forward_returns_list, orig_forward_inputs_list,
-                                orig_forward_attrs_list,
-                                orig_forward_returns_list)
-
-        # Parse Backward Inputs/Outputs
-        backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward(
-            bwd_args_str, bwd_returns_str)
-        print("Parsed Backward Inputs List: ", backward_inputs_list)
-        print("Prased Backward Attrs List: ", backward_attrs_list)
-        print("Parsed Backward Returns List: ", backward_returns_list)
-
-        # Determine Forward Inputs/Outputs Position
-        forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
-            forward_inputs_list, forward_returns_list)
-        print("Generated Forward Input Position Map: ",
-              forward_inputs_position_map)
-        print("Generated Forward Output Position Map: ",
-              forward_outputs_position_map)
-
-        # SlotName Matching
-        backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching(
-            backward_inputs_list, backward_returns_list,
-            forward_inputs_position_map, forward_outputs_position_map)
-        print("Generated Backward Fwd Input Map: ", backward_fwd_input_map)
-        print("Generated Backward Grad Input Map: ", backward_grad_input_map)
-        print("Generated Backward Grad Output Map: ", backward_grad_output_map)
-
-        # Backward Validation Check
-        BackwardValidationCheck(backward_fwd_input_map, backward_grad_input_map,
-                                backward_attrs_list)
-
-        # Node Declaration Generation
-        node_declaration_str += GenerateNodeDeclaration(
-            fwd_api_name, backward_fwd_input_map, backward_attrs_list,
-            no_need_buffer_set)
-        print("Generated Node Declaration: ", node_declaration_str)
-
-        node_definition_str += GenerateNodeDefinition(
-            fwd_api_name, bwd_api_name, backward_fwd_input_map,
-            backward_grad_input_map, backward_grad_output_map,
-            backward_attrs_list)
-        print("Generated Node Definition: ", node_definition_str)
-
-        # Node Definition Generation
-        definition_declaration_pair = GenerateForwardDefinition(
-            fwd_api_name, bwd_api_name, forward_inputs_position_map,
-            forward_outputs_position_map, forward_attrs_list,
-            backward_fwd_input_map, backward_grad_input_map,
-            backward_grad_output_map, backward_attrs_list, optional_inputs,
-            intermediate_outputs)
-        print("Generated Forward Definition: ", forward_definition_str)
-        print("Generated Forward Declaration: ", forward_declaration_str)
-        forward_definition_str += definition_declaration_pair[0]
-        forward_declaration_str += definition_declaration_pair[1]
-
-        # For python-level API dispatch
-        CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map,
-                                  forward_outputs_position_map,
-                                  forward_attrs_list)
+    for i in range(len(api_yaml_paths)):
+        api_yaml_path = api_yaml_paths[i]
+        backward_yaml_path = backward_yaml_paths[i]
+
+        if "sparse" in api_yaml_path:
+            assert "sparse" in backward_yaml_path
+            namespace = "sparse"
+        else:
+            namespace = ""
+
+        fwd_api_list = ReadFwdFile(api_yaml_path)
+        grad_api_dict = ReadBwdFile(backward_yaml_path)
+
+        yaml_forward_definition_str = ""
+        yaml_forward_declaration_str = ""
+        yaml_node_declaration_str = ""
+        yaml_node_definition_str = ""
+        for fwd_api in fwd_api_list:
+            # We only generate Ops with grad
+            if 'backward' not in fwd_api.keys():
+                continue
+
+            assert 'api' in fwd_api.keys()
+            assert 'args' in fwd_api.keys()
+            assert 'output' in fwd_api.keys()
+            assert 'backward' in fwd_api.keys()
+
no_need_buffer_set = set()
+            if 'no_need_buffer' in fwd_api.keys():
+                no_need_buffer_set = ParseNoNeedBuffer(fwd_api[
+                    'no_need_buffer'])
+
+            fwd_api_name = fwd_api['api']
+            fwd_args_str = fwd_api['args']
+            fwd_returns_str = fwd_api['output']
+
+            bwd_api_name = fwd_api['backward']
+            assert bwd_api_name in grad_api_dict.keys()
+            bwd_api = grad_api_dict[bwd_api_name]
+
+            assert 'args' in bwd_api.keys()
+            assert 'output' in bwd_api.keys()
+            assert 'forward' in bwd_api.keys()
+
+            # Parse Dispensable Inputs
+            optional_inputs = []
+            if 'optional' in fwd_api.keys():
+                optional_inputs = ParseDispensable(fwd_api['optional'])
+
+            bwd_forward_str = bwd_api['forward']
+            bwd_args_str = bwd_api['args']
+            bwd_returns_str = bwd_api['output']
+
+            # Collect Forward Inputs/Outputs
+            forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForwardFromBackward(
+                bwd_forward_str)
+            print("Parsed Forward Inputs List: ", forward_inputs_list)
+            print("Parsed Forward Attrs List: ", forward_attrs_list)
+            print("Parsed Forward Returns List: ", forward_returns_list)
+
+            intermediate_outputs = []
+            if 'intermediate' in fwd_api.keys():
+                intermediate_outputs = ParseIntermediate(fwd_api[
+                    'intermediate'])
+
+            IntermediateValidationCheck(intermediate_outputs,
+                                        forward_returns_list)
+
+            # Collect Original Forward Inputs/Outputs and then perform validation checks
+            orig_forward_inputs_list, orig_forward_attrs_list, orig_forward_returns_list = ParseYamlForward(
+                fwd_args_str, fwd_returns_str)
+            print("Parsed Original Forward Inputs List: ",
+                  orig_forward_inputs_list)
+            print("Parsed Original Forward Attrs List: ",
+                  orig_forward_attrs_list)
+            print("Parsed Original Forward Returns List: ",
+                  orig_forward_returns_list)
+
+            # Forward Validation Checks
+            ForwardsValidationCheck(
+                forward_inputs_list, forward_attrs_list, forward_returns_list,
+                orig_forward_inputs_list, orig_forward_attrs_list,
+                orig_forward_returns_list)
+
+            # Parse Backward Inputs/Outputs
+            backward_inputs_list, backward_attrs_list, backward_returns_list = ParseYamlBackward(
+                bwd_args_str, bwd_returns_str)
+            print("Parsed Backward Inputs List: ", backward_inputs_list)
+            print("Parsed Backward Attrs List: ", backward_attrs_list)
+            print("Parsed Backward Returns List: ", backward_returns_list)
+
+            # Determine Forward Inputs/Outputs Position
+            forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
+                forward_inputs_list, forward_returns_list)
+            print("Generated Forward Input Position Map: ",
+                  forward_inputs_position_map)
+            print("Generated Forward Output Position Map: ",
+                  forward_outputs_position_map)
+
+            # SlotName Matching
+            backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map = SlotNameMatching(
+                backward_inputs_list, backward_returns_list,
+                forward_inputs_position_map, forward_outputs_position_map)
+            print("Generated Backward Fwd Input Map: ", backward_fwd_input_map)
+            print("Generated Backward Grad Input Map: ",
+                  backward_grad_input_map)
+            print("Generated Backward Grad Output Map: ",
+                  backward_grad_output_map)
+
+            # Backward Validation Check
+            BackwardValidationCheck(backward_fwd_input_map,
+                                    backward_grad_input_map,
+                                    backward_attrs_list)
+
+            # Node Declaration Generation
+            yaml_node_declaration_str += GenerateNodeDeclaration(
+                fwd_api_name, backward_fwd_input_map, backward_attrs_list,
+                no_need_buffer_set)
+            print("Generated Node Declaration: ", node_declaration_str)
+
+            yaml_node_definition_str += GenerateNodeDefinition(
+                fwd_api_name, bwd_api_name,
backward_fwd_input_map, + backward_grad_input_map, backward_grad_output_map, + backward_attrs_list) + print("Generated Node Definition: ", node_definition_str) + + # Node Definition Generation + definition_declaration_pair = GenerateForwardDefinition( + fwd_api_name, bwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, optional_inputs, + intermediate_outputs) + print("Generated Forward Definition: ", forward_definition_str) + print("Generated Forward Declaration: ", forward_declaration_str) + yaml_forward_definition_str += definition_declaration_pair[0] + yaml_forward_declaration_str += definition_declaration_pair[1] + + # For python-level API dispatch + CollectCoreOpsInformation(fwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, + forward_attrs_list) + + if len(namespace) > 0: + forward_definition_str += f"""namespace {namespace} {{ + {yaml_forward_definition_str} +}} +""" + + forward_declaration_str += f"""namespace {namespace} {{ + {yaml_forward_declaration_str} +}} +""" + + node_declaration_str += f"""namespace {namespace} {{ + {yaml_node_declaration_str} +}} +""" + + node_definition_str += f"""namespace {namespace} {{ + {yaml_node_definition_str} +}} +""" + + else: + forward_definition_str += yaml_forward_definition_str + forward_declaration_str += yaml_forward_declaration_str + node_declaration_str += yaml_node_declaration_str + node_definition_str += yaml_node_definition_str # Generate Files nodes_h_path = args.nodes_h_path diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index d0506e45eb476c50301f79e787d7272c5425d986..eee32a2c5057d523212a4faa5eca8678e961f417 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,7 +14,7 @@ import os import argparse -from eager_gen import yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap skipped_fwd_api_names = set(["scale"]) @@ -126,16 +126,20 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj }} """ + namespace_str = "" + if len(namespace) > 0: + namespace_str = f"{namespace}::" + if is_forward_only: - fwd_function_name = fwd_api_name + fwd_function_name = "paddle::experimental::" + namespace_str + fwd_api_name else: - fwd_function_name = GetForwardFunctionName(fwd_api_name) + fwd_function_name = namespace_str + GetForwardFunctionName(fwd_api_name) python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, fwd_function_name, dygraph_function_call_str) - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" + python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void)) {namespace_str}eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for 
{fwd_api_name} in dygraph.\"}}\n" return python_c_function_str, python_c_function_reg_str @@ -189,7 +193,7 @@ static PyObject * eager_get_final_state_core_ops_returns_info(PyObject *self) { """ core_ops_infos_registry = """ - ,{\"get_final_state_core_ops_args_info\", + {\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, {\"get_final_state_core_ops_args_type_info\", @@ -218,10 +222,12 @@ def GeneratePythonCWrappers(python_c_function_str, python_c_function_reg_str): #include "pybind11/detail/common.h" #include "paddle/phi/api/all.h" +#include "paddle/phi/api/lib/dygraph_api.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/api/include/sparse_api.h" #include "paddle/fluid/pybind/op_function_common.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" #include "paddle/fluid/pybind/exception.h" @@ -254,57 +260,80 @@ def GeneratePythonCFile(filepath, python_c_str): if __name__ == "__main__": args = ParseArguments() - api_yaml_path = args.api_yaml_path - fwd_api_list = ReadFwdFile(api_yaml_path) - - python_c_function_list = [] - python_c_function_reg_list = [] - for fwd_api in fwd_api_list: - - # We only generate Ops with grad - is_forward_only = False - if 'backward' not in fwd_api.keys(): - is_forward_only = True - - assert 'api' in fwd_api.keys() - assert 'args' in fwd_api.keys() - assert 'output' in fwd_api.keys() - - fwd_api_name = fwd_api['api'] - fwd_args_str = fwd_api['args'] - fwd_returns_str = fwd_api['output'] - - if fwd_api_name in skipped_fwd_api_names: - continue - - # Parse Dispensable Inputs - optional_inputs = [] - if 'optional' in fwd_api.keys(): - optional_inputs = ParseDispensable(fwd_api['optional']) - - # Collect Original Forward Inputs/Outputs and then perform validation checks - forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( - fwd_args_str, fwd_returns_str) - print("Parsed Original Forward Inputs List: ", forward_inputs_list) - print("Prased Original Forward Attrs List: ", forward_attrs_list) - print("Parsed Original Forward Returns List: ", forward_returns_list) - - forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap( - forward_inputs_list, forward_returns_list) - print("Generated Forward Input Position Map: ", - forward_inputs_position_map) - print("Generated Forward Output Position Map: ", - forward_outputs_position_map) - - python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( - fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map, optional_inputs, is_forward_only) - python_c_function_list.append(python_c_function_str) - python_c_function_reg_list.append(python_c_function_reg_str) - print("Generated Python-C Function: ", python_c_function_str) - - python_c_functions_str = "\n".join(python_c_function_list) - python_c_functions_reg_str = ",\n".join(python_c_function_reg_list) + api_yaml_paths = args.api_yaml_path.split(",") + + python_c_functions_reg_str = "" + python_c_functions_str = "" + + for i in range(len(api_yaml_paths)): + api_yaml_path = api_yaml_paths[i] + + if "sparse" in api_yaml_path: + namespace = "sparse" + else: + namespace = "" + + fwd_api_list = ReadFwdFile(api_yaml_path) + + python_c_function_list = [] + 
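To make the naming above concrete: with a non-empty namespace, the generated binding calls either the C++ API directly (forward-only ops) or the namespaced dygraph function. A small runnable sketch, assuming GetForwardFunctionName appends the _final_state_dygraph_function suffix as in eager_gen.py (the api names are illustrative only):

def build_fwd_function_name(namespace, fwd_api_name, is_forward_only):
    # Mirrors the namespace_str branch in GeneratePythonCFunction above.
    namespace_str = f"{namespace}::" if len(namespace) > 0 else ""
    if is_forward_only:
        return "paddle::experimental::" + namespace_str + fwd_api_name
    return namespace_str + f"{fwd_api_name}_final_state_dygraph_function"

print(build_fwd_function_name("sparse", "to_dense", True))
# -> paddle::experimental::sparse::to_dense
print(build_fwd_function_name("", "matmul", False))
# -> matmul_final_state_dygraph_function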
python_c_function_reg_list = []
+        for fwd_api in fwd_api_list:
+
+            # We only generate Ops with grad
+            is_forward_only = False
+            if 'backward' not in fwd_api.keys():
+                is_forward_only = True
+
+            assert 'api' in fwd_api.keys()
+            assert 'args' in fwd_api.keys()
+            assert 'output' in fwd_api.keys()
+
+            fwd_api_name = fwd_api['api']
+            fwd_args_str = fwd_api['args']
+            fwd_returns_str = fwd_api['output']
+
+            if fwd_api_name in skipped_fwd_api_names:
+                continue
+
+            # Parse Dispensable Inputs
+            optional_inputs = []
+            if 'optional' in fwd_api.keys():
+                optional_inputs = ParseDispensable(fwd_api['optional'])
+
+            # Collect Original Forward Inputs/Outputs and then perform validation checks
+            forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward(
+                fwd_args_str, fwd_returns_str)
+            print("Parsed Original Forward Inputs List: ", forward_inputs_list)
+            print("Parsed Original Forward Attrs List: ", forward_attrs_list)
+            print("Parsed Original Forward Returns List: ",
+                  forward_returns_list)
+
+            forward_inputs_position_map, forward_outputs_position_map = DetermineForwardPositionMap(
+                forward_inputs_list, forward_returns_list)
+            print("Generated Forward Input Position Map: ",
+                  forward_inputs_position_map)
+            print("Generated Forward Output Position Map: ",
+                  forward_outputs_position_map)
+
+            python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction(
+                fwd_api_name, forward_inputs_position_map, forward_attrs_list,
+                forward_outputs_position_map, optional_inputs, is_forward_only)
+            python_c_function_list.append(python_c_function_str)
+            python_c_function_reg_list.append(python_c_function_reg_str)
+            print("Generated Python-C Function: ", python_c_function_str)
+
+        # Append Namespace
+        python_c_functions_reg_str += ",\n".join(
+            python_c_function_reg_list) + ","
+        python_c_functions = "\n".join(python_c_function_list)
+        if len(namespace) > 0:
+            python_c_functions_str += f"""namespace {namespace} {{
+    {python_c_functions}
+}}
+"""
+
+        else:
+            python_c_functions_str += python_c_functions

     python_c_str = GeneratePythonCWrappers(python_c_functions_str,
                                            python_c_functions_reg_str)
diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
index 8c6eeca9d3d5d80fd5bfe943ef87ba8640ada4f2..384fdcd6f97c4b318341db68cdd88b644d42d22a 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/grad_tensor_holder_test.cc
@@ -24,6 +24,8 @@
 
 #include "paddle/phi/core/kernel_registry.h"
 
+PD_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT);
+
 // TODO(jiabin): remove nolint here!!!
using namespace egr; // NOLINT diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index 6c4bf9a4f17e6f88503f0a1d6ec2f3029000b6f0..af365322e606ebfaecb7233751cacc6aa1aac423 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -33,6 +33,14 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 14e7ce8cfcfb4dea0907cd128873223c8e5859a2..5b75f1242e69bc5b37dd97467b7c55bfc6bc3871 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -32,11 +32,19 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + using namespace egr; // NOLINT using namespace egr_utils_api; // NOLINT #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); + TEST(Benchmark, EagerScaleCUDA) { eager_test::InitEnv(paddle::platform::CUDAPlace()); @@ -186,7 +194,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index 3292de9363696dae30d853980eca6fb1ba1055cc..a9d297c1c64f7b64373237a0500802a5c883aedd 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -34,6 +34,14 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace paddle { namespace imperative { diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index e9b7d10070dbf22f10e617d34f143992d19fb659..bd9eaa09ca9a406da943c8a0b0f37b674d5ea3c2 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -34,8 +34,16 @@ #include "gperftools/profiler.h" #endif +#include "paddle/phi/core/kernel_registry.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT); 
+PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); + namespace paddle { namespace imperative { @@ -248,7 +256,7 @@ TEST(Benchmark, FluidMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); #endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index a4bc56bd606f3fbb0f9152d58acb5c8edeecf905..0c894ed267fcdd08d44d4df08bfaf0554874aebf 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -30,6 +30,10 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(Backward, SingleNodeEmptyGrad) { diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index 524872b2e55638d25697388aa50724f49f6e3818..36594f1aac8cdb131bb77f1396dca19a0c2e8cc0 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -31,6 +31,10 @@ #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(CrossBatchAccumulation, SingleScaleNode) { diff --git a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc index 49bbfc77741a5b82ac9a564e25b484e5dabf77a7..dc44d95daac1d9109bbf2a1d04a8a47b081cead9 100644 --- a/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc +++ b/paddle/fluid/eager/tests/task_tests/forward_autograd_test.cc @@ -27,6 +27,10 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(Forward, SingleNode) { diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 5a7bafb2fe37051c0ad054c130d77dd6e05319d2..f7fa642ea8dd17d20816e74c9bfb4cd92b184b4a 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -30,6 +30,13 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT); +#endif + namespace egr { paddle::experimental::Tensor hook_function( diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc index 4b7077b13bdd6c48a0a3846656bd3a6337eb9f80..2a5ad53204a6201149bec0b3dac0fa3baf441f2e 100644 --- a/paddle/fluid/eager/tests/task_tests/generated_test.cc +++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc @@ -30,6 +30,12 @@ #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, 
ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace egr { TEST(Generated, Sigmoid) { diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index 9cda961741f55e9b4b7fc8dac61fe4a7c96567cf..d546df4ed087a99a28096a5336fab3826991534a 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -31,6 +31,10 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { paddle::experimental::Tensor hook_function( diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index 15b2a62dca751859882e82d46acaa46f27c2c518..56813c498d2410caa452da7a334c393b230c65bf 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -27,6 +27,12 @@ #include "paddle/fluid/eager/hooks.h" #include "paddle/phi/core/kernel_registry.h" +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); + namespace egr { paddle::experimental::Tensor hook_function( diff --git a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc index ea821d195099f3d632e0d1b2d4937bac812563c8..24e5da060111f083ef9b65574e75295fa07f8f43 100644 --- a/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/tensor_utils_test.cc @@ -23,6 +23,10 @@ #include "paddle/fluid/eager/tests/test_utils.h" #include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT); + namespace egr { TEST(TensorUtils, Test) { diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h new file mode 100644 index 0000000000000000000000000000000000000000..6f8bccd64e45f015a5c1aed44fbfdfc6f68660f1 --- /dev/null +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -0,0 +1,82 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/fluid/eager/autograd_meta.h"
+#include "paddle/fluid/eager/eager_tensor.h"
+#include "paddle/fluid/eager/to_static/run_program_op_node.h"
+#include "paddle/fluid/eager/utils.h"
+
+inline void run_program_dygraph_function(
+    const std::vector<paddle::experimental::Tensor>& x,
+    const std::vector<paddle::experimental::Tensor>& params,
+    std::vector<paddle::experimental::Tensor*>& out,     // NOLINT
+    std::vector<paddle::framework::Scope*>& step_scope,  // NOLINT
+    std::vector<paddle::experimental::Tensor*>& dout,    // NOLINT
+    const paddle::framework::AttributeMap& attrs) {
+  VLOG(2) << "start run run_program";
+  // Call forward function
+  RunProgramAPI(x, params, out, step_scope, dout, attrs);
+  VLOG(2) << "start run run_program grad";
+
+  // Prepare Autograd Meta
+  auto deref_out = details::DereferenceTensors(out);
+  std::vector<egr::AutogradMeta*> p_autograd_x =
+      egr::EagerUtils::nullable_autograd_meta(x);
+  std::vector<egr::AutogradMeta*> p_autograd_params =
+      egr::EagerUtils::nullable_autograd_meta(params);
+  std::vector<egr::AutogradMeta*> p_autograd_outs =
+      egr::EagerUtils::nullable_autograd_meta(deref_out);
+
+  bool trace_backward = egr::Controller::Instance().HasGrad();
+  bool require_any_grad = egr::EagerUtils::ComputeRequireGrad(
+      trace_backward, &p_autograd_x, &p_autograd_params);
+
+  if (require_any_grad) {
+    std::vector<std::string> out_names;
+    for (auto& t : deref_out) {
+      out_names.emplace_back(t.name());
+    }
+
+    egr::EagerUtils::PassStopGradient(false, &p_autograd_outs);
+    // Create GradOpNode (1 means [out_grad], 2 means [x_grad, params_grad])
+    auto grad_node = std::make_shared<GradNodeRunProgram>(1, 2);
+
+    grad_node->SetFwdOutNames(out_names);
+    // Set Attributes
+    grad_node->SetAttrMap(attrs);
+    // Set TensorWrappers
+    grad_node->SetFwdX(x);
+    grad_node->SetFwdParams(params);
+    grad_node->SetStepScope(step_scope);
+
+    // Set Grad out rank as same as fwd input and set stop gradient to bwd
+    grad_node->SetGradOutMeta(&p_autograd_x, /*slot id*/ 0);
+    grad_node->SetGradOutMeta(&p_autograd_params, /*slot id*/ 1);
+
+    grad_node->SetGradInMeta(&p_autograd_outs, 0);
+    // Set Next Edges
+    grad_node->AddEdges(&p_autograd_x, /*slot id*/ 0);
+    grad_node->AddEdges(&p_autograd_params, /*slot id*/ 1);
+
+    egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0);
+
+    // Set History for outputs: set the current GradNode on each output
+    egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node);
+    egr::EagerUtils::CheckAndRetainGrad(deref_out);
+  }
+}
diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae5d86664a346fd8a1d877f9e1dd74f687302595
--- /dev/null
+++ b/paddle/fluid/eager/to_static/run_program_op_node.h
@@ -0,0 +1,468 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/eager/api/utils/global_utils.h"
+#include "paddle/fluid/eager/grad_node_info.h"
+#include "paddle/fluid/eager/tensor_wrapper.h"
+
+#include "paddle/fluid/operators/run_program_op.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace details {
+using Tensor = paddle::experimental::Tensor;
+
+static std::vector<Tensor> DereferenceTensors(
+    const std::vector<Tensor *> &tensor_ptr) {
+  std::vector<Tensor> res;
+  for (auto *t : tensor_ptr) {
+    res.emplace_back(*t);
+  }
+  return res;
+}
+
+static std::vector<std::string> GetTensorsName(const std::vector<Tensor> &ins) {
+  std::vector<std::string> in_names;
+  for (auto &in_t : ins) {
+    in_names.emplace_back(in_t.name());
+  }
+  return in_names;
+}
+
+static std::vector<std::string> GetTensorsName(
+    const std::vector<Tensor *> &ins) {
+  std::vector<std::string> in_names;
+  for (auto *in_t : ins) {
+    in_names.emplace_back(in_t->name());
+  }
+  return in_names;
+}
+
+static void CheckInputVarStatus(const Tensor &tensor) {
+  PADDLE_ENFORCE_EQ(
+      tensor.defined() && phi::DenseTensor::classof(tensor.impl().get()), true,
+      paddle::platform::errors::InvalidArgument(
+          "The input tensor %s of "
+          "RunProgram(Grad)Op holds "
+          "wrong type. Expect type is DenseTensor.",
+          tensor.name()));
+
+  PADDLE_ENFORCE_EQ(tensor.initialized(), true,
+                    paddle::platform::errors::InvalidArgument(
+                        "The tensor in input tensor %s of "
+                        "RunProgram(Grad)Op "
+                        "is not initialized.",
+                        tensor.name()));
+}
+
+static void CheckOutputVarStatus(const paddle::framework::Variable &src_var,
+                                 const Tensor &dst_tensor) {
+  auto name = dst_tensor.name();
+  PADDLE_ENFORCE_EQ(dst_tensor.defined(), true,
+                    paddle::platform::errors::InvalidArgument(
+                        "dst_tensor shall be defined."));
+
+  if (phi::DenseTensor::classof(dst_tensor.impl().get())) {
+    auto &src_tensor = src_var.Get<phi::DenseTensor>();
+    PADDLE_ENFORCE_EQ(phi::DenseTensor::classof(&src_tensor), true,
+                      paddle::platform::errors::InvalidArgument(
+                          "The output tensor %s get from "
+                          "RunProgram(Grad)Op's internal scope holds "
+                          "wrong type. Expect type is DenseTensor",
+                          name));
+    PADDLE_ENFORCE_EQ(src_tensor.initialized(), true,
+                      paddle::platform::errors::InvalidArgument(
+                          "The tensor in output tensor %s get from "
+                          "RunProgram(Grad)Op's internal "
+                          "scope is not initialized.",
+                          name));
+  } else if (phi::SelectedRows::classof(dst_tensor.impl().get())) {
+    auto &src_tensor = src_var.Get<phi::SelectedRows>();
+    PADDLE_ENFORCE_EQ(phi::SelectedRows::classof(&src_tensor), true,
+                      paddle::platform::errors::InvalidArgument(
+                          "The output tensor %s get from "
+                          "RunProgram(Grad)Op's internal scope holds "
+                          "wrong type.
Expect type is SelectedRows", + name)); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + paddle::platform::errors::InvalidArgument( + "The tensor in output tensor %s get from " + "RunProgram(Grad)Op's " + "internal scope is not initialized.", + name)); + + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The RunProgram(Grad)Op only support output " + "variable of type LoDTensor or SelectedRows", + name)); + } +} + +static void ShareTensorsIntoScope(const std::vector &tensors, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + auto name = tensors[i].name(); + if (name == "Fake_var" || !tensors[i].is_initialized()) { + continue; + } + auto *var = scope->Var(name); + CheckInputVarStatus(tensors[i]); + // share tensor + auto tensor_base = tensors[i].impl(); + if (phi::DenseTensor::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto *dst_tensor = var->GetMutable(); + auto t = std::dynamic_pointer_cast(tensor_base); + *dst_tensor = *t; + } + } +} + +static void ShareTensorsFromScope( + const std::vector &tensors, + const paddle::framework::BlockDesc &global_block, + paddle::framework::Scope *scope) { + for (size_t i = 0; i < tensors.size(); ++i) { + // NOTE: In case of setting out_tmp.stop_gradient = True in model code, all + // parameters before generating out_tmp have no @GRAD, it will raise error + // because we can't find them in scope. So we skip sharing these vars or + // var@GRAD if they don't appear in global block. + auto &name = tensors[i]->name(); + if (name == paddle::framework::kEmptyVarName || name == "Fake_var" || + !global_block.HasVar(name)) { + VLOG(2) << "find tensor name is " << name << ", skip it!"; + continue; + } + // NOTE: Here skip not found var is dangerous, if a bug is caused here, + // the result is grad calculation error, which will be very hidden! + auto *var = scope->FindVar(name); + PADDLE_ENFORCE_NOT_NULL(var, paddle::platform::errors::NotFound( + "The output tensor %s is not in " + "RunProgram(Grad)Op'" + "s internal scope.", + name)); + CheckOutputVarStatus(*var, *tensors[i]); + // share tensor + // TODO(dev): Determine Tensor type by scope.var + // auto tensor_base = tensors[i]->impl(); + // if (phi::DenseTensor::classof(tensor_base.get())) { + if (var->IsType()) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + VLOG(2) << "share " << name << " from scope"; + *dst_tensor = src_tensor; + } else if (var->IsType()) { + // } else if (phi::SelectedRows::classof(tensor_base.get())) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + *dst_tensor = src_tensor; + } + } +} + +} // namespace details + +inline void RunProgramAPI( + const std::vector &x, + const std::vector ¶ms, + std::vector &out, // NOLINT + std::vector &step_scope, // NOLINT + std::vector &dout, // NOLINT + const paddle::framework::AttributeMap &attrs) { + VLOG(2) << "RunProgramOpKernel Compute"; + auto start_op_index = BOOST_GET_CONST(int64_t, attrs.at("start_op_index")); + auto end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + auto is_test = BOOST_GET_CONST(bool, attrs.at("is_test")); + auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id")); + + // NOTE(chenweihang): In order not to add new variable type, use vector + // here. 
Originally, here can use scope directly. + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + // Step 2. prepare executor and init persistable variables + + // NOTE(Aurelius84): While training some models, forward can be called many + // times and then apply backpropagation all at once, such as Reinforcement + // Learning. Tensor data in multi-step training should be saved into single + // scope separately. Otherwise, the gradients can be miscalculated because + // always using the Tensor data of the last step in forward. + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + VLOG(2) << "The number of sub scopes before forward: " + << out_scope_vec->front()->kids().size(); + paddle::framework::Scope &scope = global_inner_scope->NewScope(); + + // share input_vars & parameters into scope + details::ShareTensorsIntoScope(x, &scope); + details::ShareTensorsIntoScope(params, &scope); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto input_names = details::GetTensorsName(x); + auto output_names = details::GetTensorsName(out); + auto dout_names = details::GetTensorsName(dout); + auto *program = global_block->Program(); + + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad=*/false, program_id, &scope); + auto ¶llel_executor = cache_info.first; + // all out_vars are skip_eager_var + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, false); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, input_names); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + output_names.begin(), output_names.end()); + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + dout_names.begin(), dout_names.end()); + paddle::framework::details::ParseSafeEagerDeletionSkipVars( + *program, end_op_index, output_names, &skip_eager_delete_vars); + } + + // Step 3. run ops + parallel_executor->RunWithoutFetch(skip_eager_delete_vars); + } + // Step 4. Get Output + details::ShareTensorsFromScope(out, *global_block, &scope); + details::ShareTensorsFromScope(dout, *global_block, &scope); + + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + // Step 5. Drop all children scopes while testing. + if (is_test) { + out_scope_vec->front()->DropKids(); + } + VLOG(2) << "The number of sub scopes after forward: " + << out_scope_vec->front()->kids().size(); + // #ifdef PADDLE_WITH_MKLDNN + // if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); + // #endif +} + +inline void RunProgramGradAPI( + const std::vector &x, + const std::vector ¶ms, + const std::vector &out_grad, + const std::vector &step_scope, // NOLINT + const paddle::framework::AttributeMap &attrs, + std::vector &x_grad, // NOLINT + std::vector ¶ms_grad // NOLINT + ) { + // if all output vars are set to stop_gradient, grad op no need to executed + if (x_grad.empty() && params_grad.empty()) return; + + // TODO(dev): Remove this line hard code. And need to deal with the out_grad + // name problem. 
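The name problem mentioned in this TODO is handled generically by GradNodeRunProgram::operator() further down, which renames every incoming grad tensor after its forward output. A tiny Python sketch of that naming convention (the tensor names are hypothetical):

fwd_out_names = ["matmul_v2_0.tmp_0", "mean_0.tmp_0"]
out_grad_names = [name + "@GRAD" for name in fwd_out_names]
assert out_grad_names == ["matmul_v2_0.tmp_0@GRAD", "mean_0.tmp_0@GRAD"]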
+ // const_cast(out_grad[0]) + // .set_name("matmul_v2_0.tmp_0@GRAD"); + + auto *global_block = + BOOST_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); + auto orig_end_op_index = BOOST_GET_CONST(int64_t, attrs.at("end_op_index")); + + auto program_id = BOOST_GET_CONST(int64_t, attrs.at("program_id")); + // NOTE: skip `shape` and `fill_constant` op created by + // fluid.backward.gradients, one forward output will generate one `shape` + // and `fill_constant` + int64_t start_op_index = orig_end_op_index + (out_grad.size() * 2); + int64_t end_op_index = global_block->OpSize(); + + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + auto sub_scope_num = global_inner_scope->kids().size(); + VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; + PADDLE_ENFORCE_GT(sub_scope_num, 0, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should hold at " + "least one sub scope.")); + + auto &scope = *(global_inner_scope->kids().front()); + const auto &place = egr::Controller::Instance().GetExpectedPlace(); + + if (end_op_index > start_op_index) { + auto out_grad_names = details::GetTensorsName(out_grad); + // NOTE: after PR22939 [Add double grad] merged, the grad op maker's + // SetOutput will set to None if the input var stop_gradient=True, + // it will cause an NotFound error when ctx.OutputNames() is called + std::vector x_grad_names; + std::vector param_grad_names; + if (!x_grad.empty()) { + x_grad_names = details::GetTensorsName(x_grad); + } + if (!params_grad.empty()) { + param_grad_names = details::GetTensorsName(params_grad); + } + + // Step 2. prepare executor and scope + auto *program = global_block->Program(); + auto cache_info = paddle::framework::GetExecutorInfoFromCache( + *program, place, start_op_index, end_op_index, + /*is_grad*/ true, program_id, &scope); + auto ¶llel_executor = cache_info.first; + + auto &skip_eager_delete_vars = + paddle::framework::ExecutorInfoCache::Instance().SkipEagerDeleteVars( + program_id, true); + if (cache_info.second /*is_new_created*/) { + parallel_executor->SkipMemoryReuse(/*scope_idx=*/0, out_grad_names); + + skip_eager_delete_vars.insert(skip_eager_delete_vars.end(), + x_grad_names.begin(), x_grad_names.end()); + paddle::framework::details::AppendSkipDeletionVars( + param_grad_names, &skip_eager_delete_vars); + } + + details::ShareTensorsIntoScope(out_grad, &scope); + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + + // Step 3. run ops + parallel_executor->RunWithoutFetch( + /*skip_eager_delete_vars=*/skip_eager_delete_vars); + } + + // Step 4. get outputs + details::ShareTensorsFromScope(x_grad, *global_block, &scope); + details::ShareTensorsFromScope(params_grad, *global_block, &scope); + + // Step5. 
drop current scope
+  // global_inner_scope->DeleteScope(&scope);
+  VLOG(2) << "The number of sub scopes after backward: "
+          << global_inner_scope->kids().size();
+}
+
+class GradNodeRunProgram : public egr::GradNodeBase {
+ public:
+  GradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num)
+      : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {}
+
+  ~GradNodeRunProgram() override = default;
+  // Functor: perform backward computations
+  virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
+      const std::vector<std::vector<paddle::experimental::Tensor>> &grads)
+      override {
+    VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram";
+    PADDLE_ENFORCE_EQ(
+        grads.size(), 1,
+        paddle::platform::errors::InvalidArgument(
+            "The out_grads.size() of RunProgramGradOp should be equal to 1."));
+
+    VLOG(3) << "out_grads[0].size() : " << grads[0].size();
+    std::vector<paddle::experimental::Tensor> x_grad;
+    std::vector<paddle::experimental::Tensor> params_grad;
+    ConstructGradTensors(x_, &x_grad);
+    ConstructGradTensors(params_, &params_grad);
+    std::vector<paddle::experimental::Tensor *> x_grad_ptr;
+    std::vector<paddle::experimental::Tensor *> params_grad_ptr;
+    for (auto &i : x_grad) {
+      x_grad_ptr.emplace_back(&i);
+    }
+    for (auto &i : params_grad) {
+      params_grad_ptr.emplace_back(&i);
+    }
+
+    // auto x_grad_ptr = ConstructGradTensors(x_);
+    // auto params_grad_ptr = ConstructGradTensors(params_);
+
+    PADDLE_ENFORCE_EQ(
+        grads[0].size(), fwd_out_names_.size(),
+        paddle::platform::errors::InvalidArgument(
+            "The grads[0].size() and fwd_out_names_.size() should be equal."));
+    for (size_t i = 0; i < fwd_out_names_.size(); ++i) {
+      const_cast<paddle::experimental::Tensor &>(grads[0][i])
+          .set_name(fwd_out_names_[i] + "@GRAD");
+    }
+
+    RunProgramGradAPI(x_, params_, grads[0], step_scope_, attrs_, x_grad_ptr,
+                      params_grad_ptr);
+    VLOG(3) << "End Eager Backward Node: GradNodeRunProgram";
+    return {x_grad, params_grad};
+    // return {x_grad, details::DereferenceTensors(params_grad_ptr)};
+  }
+
+  // SetAttrMap
+  void SetAttrMap(const paddle::framework::AttributeMap &attrs) {
+    attrs_ = attrs;
+  }
+
+  void SetFwdX(const std::vector<paddle::experimental::Tensor> &tensors) {
+    x_ = tensors;
+  }
+
+  void SetFwdParams(const std::vector<paddle::experimental::Tensor> &tensors) {
+    params_ = tensors;
+  }
+
+  void SetStepScope(const std::vector<paddle::framework::Scope *> &scopes) {
+    step_scope_ = scopes;
+  }
+
+  void SetFwdOutNames(std::vector<std::string> out_names) {
+    fwd_out_names_ = out_names;
+  }
+
+ protected:
+  void ConstructGradTensors(
+      const std::vector<paddle::experimental::Tensor> &fwd_tensors,
+      std::vector<paddle::experimental::Tensor> *grad_tensors) {
+    // TODO(dev): Need an elegant way to determine information of grad_tensor,
+    // such as: name, tensor type (DenseTensor or SelectedRows).
+    VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size();
+    for (auto &fwd_t : fwd_tensors) {
+      grad_tensors->emplace_back(fwd_t.impl());
+      auto &grad_t = grad_tensors->back();
+      grad_t.set_name(fwd_t.name() + "@GRAD");
+    }
+  }
+
+  void ConstructGradTensors(
+      const std::vector<paddle::experimental::Tensor> &fwd_tensors) {
+    VLOG(3) << "fwd_tensors.size(): " << fwd_tensors.size();
+    for (auto &fwd_t : fwd_tensors) {
+      auto grad_tensor = egr::EagerUtils::unsafe_autograd_meta(fwd_t)->Grad();
+      grad_tensor.set_name(fwd_t.name() + "@GRAD");
+    }
+  }
+
+ private:
+  // TensorWrappers
+  std::vector<paddle::experimental::Tensor> x_;
+  std::vector<paddle::experimental::Tensor> params_;
+  std::vector<paddle::framework::Scope *> step_scope_;
+
+  std::vector<std::string> fwd_out_names_;
+
+  // Attribute Map
+  paddle::framework::AttributeMap attrs_;
+};
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index e486799495c7abe18ef7a85eca45e1b7ddf41608..aa92a3b2226c1fca1fa7326e76ef29b0b38cd8d6 100755
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -443,7 +443,7 @@ cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framewo
 #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} )
 #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
 
-set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator phi_custom_kernel)
+set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator)
 
 cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc
index 7232a707916dd5f0795c04cff8137c5e88132d42..29c7f5d0ce73cbf1af18e6f5869d59d2200917ad 100644
--- a/paddle/fluid/framework/infershape_utils.cc
+++ b/paddle/fluid/framework/infershape_utils.cc
@@ -90,6 +90,8 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext {
 
   bool IsForInferShape() const override { return true; }
 
+  bool IsRuntime() const override { return ctx_.IsRuntime(); }
+
  private:
   const InferShapeContext& ctx_;
 };
@@ -232,16 +234,8 @@ class CompatMetaTensor : public phi::MetaTensor {
     }
   }
 
-  void share_meta(const MetaTensor& meta_tensor) override {
+  void share_dims(const MetaTensor& meta_tensor) override {
     set_dims(meta_tensor.dims());
-    set_dtype(meta_tensor.dtype());
-    // VarDesc doesn't contains layout, so we cannot share layout
-    // set_layout(meta_tensor.layout());
-
-    // special case 1: share lod of LoDTensor
-    share_lod(meta_tensor);
-
-    // special case 2: share height and rows of SelectedRows in runtime
     if (is_runtime_) {
       auto* var = BOOST_GET(Variable*, var_);
       if (var->IsType<phi::SelectedRows>()) {
@@ -254,6 +248,16 @@ class CompatMetaTensor : public phi::MetaTensor {
     }
   }
 
+  void share_meta(const MetaTensor& meta_tensor) override {
+    set_dtype(meta_tensor.dtype());
+    // VarDesc doesn't contain layout, so we cannot share layout
+    // set_layout(meta_tensor.layout());
+
+    // special case 1: share lod of LoDTensor
+    share_lod(meta_tensor);
+    share_dims(meta_tensor);
+  }
+
  private:
   const LoD& GetRuntimeLoD() const {
     auto* var = BOOST_GET_CONST(Variable*, var_);
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
index d33dc7f49feb0f4c9e585d13186d65b6c2d618c0..636a594a657cb0744aac161d928ff9078b1f92bc
100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -20,12 +20,15 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(scale); USE_OP(elementwise_mul); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(elementwise_add_grad); +PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); + DECLARE_double(eager_delete_tensor_gb); namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 0a95444f852dd0abdd150d04dc7536e26151c218..d578ada0db00fed85f7b4f25f1483169c72c2c0b 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -15,8 +15,9 @@ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass.h" #include -#include #include + +#include #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -27,7 +28,7 @@ USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(leaky_relu); USE_OP_DEVICE_KERNEL(leaky_relu, MKLDNN); USE_OP(gelu); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(tanh); USE_OP_DEVICE_KERNEL(tanh, MKLDNN); diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 62d87b6917e4059f08dcdf5ccb4eed6434211e43..219aae71127ed8963b4bfe4e8ee5e7259dbf7d02 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -37,7 +37,7 @@ USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); USE_OP_ITSELF(reduce_mean); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP(reduce_mean_grad); USE_OP_ITSELF(reshape2_grad); USE_OP(softmax_with_cross_entropy_grad); @@ -46,7 +46,7 @@ USE_OP(matmul_grad); USE_OP(square); USE_OP(transpose2_grad); USE_OP(concat_grad); -USE_OP(elementwise_mul_grad); +USE_OP_ITSELF(elementwise_mul_grad); USE_OP(sigmoid_grad); USE_OP(tanh_grad); USE_OP(sum); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index eff6d9a9102d2b486bdab66755071a2382cdb404..f8e30c1ee294ecf692e2992b6123232ba1c8bd7d 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -539,6 +539,20 @@ bool ExecutionContext::HasInput(const std::string& name) const { return var != nullptr; } +bool ExecutionContext::HasInputs(const std::string& name) const { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end() || it->second.empty()) { + return false; + } + for (const auto* input : it->second) { + if (input == nullptr) { + return false; + } + } + return true; +} + bool ExecutionContext::HasOutput(const std::string& name) const { auto* var = OutputVar(name); return var != nullptr; @@ -2189,6 +2203,51 @@ void OperatorWithKernel::BuildPhiKernelContext( std::move(experimental::MakePhiScalarFromVar(*ins_vector.front()))); } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector))) { + auto& attr = Attrs().at(attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + 
const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + const auto& vec = BOOST_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + pt_kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "construct KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later auto& attr = Attrs().at(attr_names[i]); @@ -2212,7 +2271,11 @@ void OperatorWithKernel::BuildPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector))) { + std::type_index(typeid(std::vector))) { + pt_kernel_context->EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { // Emplace Back Attr according to the type of Phi_Kernel args. 
const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e33d4feb82a9e7a92c3dabea0ccc5fe370afda66..1a1171f1dba4d794796ef1421fe386f60a0e587d 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -295,6 +295,8 @@ class ExecutionContext { virtual bool HasInput(const std::string& name) const; + virtual bool HasInputs(const std::string& name) const; + virtual bool HasOutput(const std::string& name) const; virtual size_t InputSize(const std::string& name) const { @@ -449,7 +451,7 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { : ctx_(ctx) {} bool HasInput(const std::string& name) const override { - return ctx_.HasInput(name); + return ctx_.HasInputs(name); } bool HasOutput(const std::string& name) const override { diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index bf9d1baaf394f05d125563311dd2047383373834..47dffd47b7cbbf4a37e6715b40d41024330bc679 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -675,7 +675,7 @@ TEST(BuildCinnPassTest, NoNeedBufferInput) { USE_PASS(build_cinn_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); -USE_OP(relu_grad); +USE_OP_ITSELF(relu_grad); USE_OP_ITSELF(elementwise_add_grad); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc index e8badab27b9b97aade81bf496ce211485f924757..cdccc4c5546900a141a084281f419c2940b23817 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -301,5 +301,5 @@ TEST(CinnCompilerTest, Compile) { USE_PASS(build_cinn_pass); USE_PASS(graph_viz_pass); USE_OP(mul); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index fe5ac73b0046915c4a52087ed792925b0b0ed200..fbc47f81fd33169f54aeb2c251f9b6c90cb44637 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -133,6 +133,11 @@ class DygraphExecutionContext : public framework::ExecutionContext { return (it != var_map_in_.end() && it->second.size() > 0); } + bool HasInputs(const std::string& name) const override { + auto it = var_map_in_.find(name); + return (it != var_map_in_.end() && it->second.size() > 0); + } + bool HasOutput(const std::string& name) const override { auto it = var_map_out_.find(name); return (it != var_map_out_.end() && it->second.size() > 0); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 2317bfdd7c0d5ee94e91e081da47177625f5bfd8..bae49fb381a475dd8227d1dc855a6db28c9cd273 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -247,6 +247,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif #ifdef PADDLE_WITH_XPU_KP + expected_kernel_key.place_ = platform::XPUPlace(); bool use_xpu_kp_kernel_rt = FLAGS_run_kp_kernel && paddle::platform::is_xpu_kp_support_op(op.Type(), expected_kernel_key); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 
30dbe07d7afca6473785d7a64be6864534b84e3c..d7c0c8cc547e6b04f67ddbb06121d139756d5142 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -332,6 +332,7 @@ void BuildDygraphPhiKernelContext( } for (size_t i = 0; i < attr_names.size(); ++i) { + VLOG(1) << "############## attr_name: " << i << " : " << attr_names[i]; if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { if (attrs.find(attr_names[i]) != attrs.end()) { // shape is in the attribute @@ -409,6 +410,60 @@ void BuildDygraphPhiKernelContext( experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } + } else if (attr_defs[i].type_index == + std::type_index(typeid(std::vector<phi::Scalar>))) { + auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector<int32_t>))) { + const auto& vec = BOOST_GET_CONST(std::vector<int32_t>, attr); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector<int64_t>))) { + const auto& vec = BOOST_GET_CONST(std::vector<int64_t>, attr); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector<float>))) { + const auto& vec = BOOST_GET_CONST(std::vector<float>, attr); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector<double>))) { + const auto& vec = BOOST_GET_CONST(std::vector<double>, attr); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector<bool>))) { + const auto& vec = BOOST_GET_CONST(std::vector<bool>, attr); + std::vector<phi::Scalar> scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_ctx->EmplaceBackAttr(std::move(scalar_list)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector<Scalar> when " + "constructing KernelContext.", + attr_names[i])); + } } else { // TODO(chenweihang): support other attrs later auto& attr = GetAttr(attrs, default_attrs, attr_names[i]); @@ -432,7 +487,11 @@ void BuildDygraphPhiKernelContext( } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector<int64_t>))) { if (std::type_index(attr.type()) == - std::type_index(typeid(std::vector<int>))) { + std::type_index(typeid(std::vector<int64_t>))) { + kernel_ctx->EmplaceBackAttr( + BOOST_GET_CONST(std::vector<int64_t>, attr)); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector<int>))) { // Emplace Back Attr according to the type of Phi_Kernel args.
const auto& vector_int_attr = BOOST_GET_CONST(std::vector<int>, attr); const std::vector<int64_t> vector_int64_attr(vector_int_attr.begin(), diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 3ac2028790608529e0745dde2ce41ed57748f46d..02a1689c23a3fe5e1543a2e52d7661d5997bc062 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -24,6 +24,10 @@ #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); namespace platform = paddle::platform; namespace framework = paddle::framework; diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index f5ca13cb99ad3df6b9283565b5681c36f7197ae8..4cda3f32fdf3fdd2d14b201fa902c1f50f3ff98d 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -24,6 +24,13 @@ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/imperative/prepared_operator.h" #include "paddle/fluid/imperative/type_defs.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(relu, GPU, ALL_LAYOUT); +#endif namespace imperative = paddle::imperative; namespace platform = paddle::platform; @@ -226,7 +233,7 @@ TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) { } // namespace paddle USE_OP_ITSELF(split); -USE_OP(relu); +USE_OP_ITSELF(relu); #ifdef PADDLE_WITH_MKLDNN USE_OP_DEVICE_KERNEL(relu, MKLDNN); #endif diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index d05036f7a12ebdc3db5fbfda5eb50c295c0478e4..2e38bd77cf63cc85b75a50e62250a6e746f525bc 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -28,6 +28,13 @@ #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +#endif namespace imperative = paddle::imperative; namespace platform = paddle::platform; @@ -591,5 +598,5 @@ TEST(test_tracer, eager_tracer) { USE_OP(mul); USE_OP(mul_grad); USE_OP_ITSELF(reduce_sum); -USE_OP(reduce_sum_grad); +USE_OP_ITSELF(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 26b8b9e8e17e046964d648f564c26293036e4033..5d0c3c98d2f618eb1f3d41e6a4e2434e5cd80401 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -45,6 +45,11 @@ add_subdirectory(api) set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) + +if(WITH_ONNXRUNTIME) + set(STATIC_INFERENCE_API ${STATIC_INFERENCE_API} onnxruntime_predictor) +endif() + #TODO(wilber, T8T9): Do we still need to support windows gpu static library?
if(WIN32 AND WITH_GPU) cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules}) @@ -91,6 +96,13 @@ if (WITH_PSCORE) set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} fleet ps_service) endif () +if (WITH_ONNXRUNTIME) + set(SHARED_INFERENCE_SRCS ${SHARED_INFERENCE_SRCS} + ${CMAKE_CURRENT_SOURCE_DIR}/api/onnxruntime_predictor.cc + ) + set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} onnxruntime_predictor) +endif (WITH_ONNXRUNTIME) + # Create shared inference library cc_library(paddle_inference_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${SHARED_INFERENCE_DEPS}) diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 6eeb5d64253597382de611b25550e60a8e83eb45..bdc16ef4c7907764473c552461cde35f011ad489 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -31,7 +31,7 @@ cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tens cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) set(paddle_inference_api_deps lod_tensor scope reset_tensor_array - analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator phi_custom_kernel) + analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator) if(WITH_CRYPTO) list(APPEND paddle_inference_api_deps paddle_crypto) @@ -49,8 +49,15 @@ if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) endif() -cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} - zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) +if (WITH_ONNXRUNTIME) + cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx) + cc_library(onnxruntime_predictor SRCS onnxruntime_predictor.cc DEPS analysis_predictor) +else (WITH_ONNXRUNTIME) + cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) +endif (WITH_ONNXRUNTIME) + cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -75,6 +82,16 @@ elseif (WIN32) ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() +if (WITH_ONNXRUNTIME) + if (NOT APPLE AND NOT WIN32) + cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS paddle_inference_shared + ARGS --dirname=${MOBILENETV2_MODEL_DIR}) + elseif (WIN32) + cc_test(test_onnxruntime_predictor SRCS onnxruntime_predictor_tester.cc DEPS onnxruntime_predictor benchmark ${inference_deps} + ARGS --dirname=${MOBILENETV2_MODEL_DIR}) + endif() +endif() + if(WITH_TESTING AND WITH_MKLDNN) if (NOT APPLE AND NOT WIN32) cc_test(test_mkldnn_quantizer SRCS mkldnn_quantizer_tester.cc DEPS paddle_inference_shared ARGS --dirname=${WORD2VEC_MODEL_DIR}) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 9c33d7003064532db7276d0f6dad90e1b2c55104..41c01d3b7e261314d8dc6b852f5b2a597421fe48 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -168,6 +168,33 @@ void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, int ipu_replica_num, Update(); } +void AnalysisConfig::EnableONNXRuntime() { +#ifdef PADDLE_WITH_ONNXRUNTIME + use_onnxruntime_ = true; +#else + 
LOG(ERROR) << "Please compile with onnxruntime to EnableONNXRuntime()"; + use_onnxruntime_ = false; +#endif + + Update(); +} + +void AnalysisConfig::DisableONNXRuntime() { + use_onnxruntime_ = false; + Update(); +} + +void AnalysisConfig::EnableORTOptimization() { +#ifdef PADDLE_WITH_ONNXRUNTIME + enable_ort_optimization_ = true; +#else + LOG(ERROR) << "Please compile with onnxruntime to EnableORTOptimization()"; + enable_ort_optimization_ = false; +#endif + + Update(); +} + AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { #define CP_MEMBER(member__) member__ = other.member__; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index df61b5103195d2bd9a18cb231076f0d89bc8bae9..871ed596a3ee9d6362b03e99ca10313765826a51 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -65,6 +65,10 @@ #include "paddle/fluid/inference/api/mkldnn_quantizer.h" #endif +#ifdef PADDLE_WITH_ONNXRUNTIME +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" +#endif + #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/helper.h" @@ -1762,6 +1766,27 @@ namespace paddle_infer { Predictor::Predictor(const Config &config) { const_cast(&config)->SwitchUseFeedFetchOps(false); // The second parameter indicates that the discard log is not printed + if (config.use_onnxruntime()) { +#ifdef PADDLE_WITH_ONNXRUNTIME + if (config.use_gpu()) { + LOG(WARNING) << "The current ONNXRuntime backend doesn't support GPU," + "and it falls back to use Paddle Inference."; + } else if (!paddle::CheckConvertToONNX(config)) { + LOG(WARNING) + << "Paddle2ONNX do't support convert the Model, fall back to using " + "Paddle Inference."; + } else { + predictor_ = paddle::CreatePaddlePredictor< + Config, paddle::PaddleEngineKind::kONNXRuntime>(config); + return; + } +#else + LOG(WARNING) + << "The onnxruntime backend isn't enabled," + " and please re-compile Paddle with WITH_ONNXRUNTIME option," + "fall back to using Paddle Inference."; +#endif + } predictor_ = paddle::CreatePaddlePredictor< Config, paddle::PaddleEngineKind::kAnalysis>(config); } diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 9c7e5c6b27e68ee10be5f8b56d6de4aea4524078..2c6e8f4f1a4d9ea0dfba8f400c7d3782a5e2c32d 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -357,6 +357,24 @@ TEST(AnalysisPredictor, set_xpu_device_id) { } #endif +TEST(AnalysisPredictor, enable_onnxruntime) { + AnalysisConfig config; + config.EnableONNXRuntime(); +#ifdef PADDLE_WITH_ONNXRUNTIME + ASSERT_TRUE(config.use_onnxruntime()); +#else + ASSERT_TRUE(!config.use_onnxruntime()); +#endif + config.EnableORTOptimization(); +#ifdef PADDLE_WITH_ONNXRUNTIME + ASSERT_TRUE(config.ort_optimization_enabled()); +#else + ASSERT_TRUE(!config.ort_optimization_enabled()); +#endif + config.DisableONNXRuntime(); + ASSERT_TRUE(!config.use_onnxruntime()); +} + } // namespace paddle namespace paddle_infer { @@ -408,6 +426,14 @@ TEST(Predictor, Run) { predictor->TryShrinkMemory(); } +TEST(Predictor, EnableONNXRuntime) { + Config config; + config.SetModel(FLAGS_dirname); + config.EnableONNXRuntime(); + config.EnableORTOptimization(); + auto predictor = CreatePredictor(config); +} + TEST(Tensor, CpuShareExternalData) { Config config; 
config.SetModel(FLAGS_dirname); diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index d03840ada36bce8cfdc2213284697e6d873cbde0..df98a7b05cf3f2035e9a21ec10e4b44eca843bbd 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -4,6 +4,7 @@ option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL. option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) option(USE_TENSORRT "Compile demo with TensorRT." OFF) +option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) if(NOT WITH_STATIC_LIB) add_definitions("-DPADDLE_WITH_SHARED_LIB") @@ -46,6 +47,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (WITH_ONNXRUNTIME) + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") + + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") +endif() if (WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") @@ -151,6 +159,17 @@ else() endif() endif() +if (WITH_ONNXRUNTIME) + if(WIN32) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx) + elseif(APPLE) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx) + else() + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx) + endif() +endif() + + if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} @@ -213,6 +232,14 @@ if(WIN32) COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release ) endif() + if(WITH_ONNXRUNTIME) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + endif() if(NOT WITH_STATIC_LIB) add_custom_command(TARGET ${DEMO_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} diff --git a/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc new file mode 100644 index 0000000000000000000000000000000000000000..ef5c08cd041eb7af4c7f17a95c4fd9b8601e4bad --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/onnxruntime_mobilenet_demo.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains a demo of mobilenet for onnxruntime. + */ + +#include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files. +#include <vector> +#include "gflags/gflags.h" +#include "utils.h" // NOLINT + +DEFINE_string(modeldir, "", "Directory of the inference model."); + +namespace paddle { +namespace demo { + +/* + * Use the ONNXRuntime engine to run inference on the demo model. + */ +void Main() { + paddle::AnalysisConfig config; + config.EnableONNXRuntime(); + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + auto predictor = paddle_infer::CreatePredictor(config); + + // Inference. + std::vector<int> input_shape = {1, 3, 224, 224}; + std::vector<float> input_data(1 * 3 * 224 * 224, 1.0); + std::vector<float> out_data; + out_data.resize(1000); + auto input_names = predictor->GetInputNames(); + auto output_names = predictor->GetOutputNames(); + auto input_tensor = predictor->GetInputHandle(input_names[0]); + input_tensor->Reshape(input_shape); + auto output_tensor = predictor->GetOutputHandle(output_names[0]); + + input_tensor->CopyFromCpu(input_data.data()); + predictor->Run(); + output_tensor->CopyToCpu(out_data.data()); + + VLOG(3) << "output.size " << out_data.size(); +} + +} // namespace demo +} // namespace paddle + +int main(int argc, char** argv) { + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + paddle::demo::Main(); + return 0; +} diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 5f062e8063253a08466b2491e80417af07047394..79a31555c7f0b1cb4a8d9c48bae16145d605935b 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -21,7 +21,8 @@ TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset USE_TENSORRT=$5 TENSORRT_ROOT_DIR=$6 # TensorRT root dir, default to /usr -MSVC_STATIC_CRT=$7 +WITH_ONNXRUNTIME=$7 +MSVC_STATIC_CRT=$8 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform @@ -38,6 +39,26 @@ else use_gpu_list='false' fi +mkdir -p $DATA_DIR +cd $DATA_DIR + +if [ $WITH_ONNXRUNTIME == ON ]; then + ONNXRUNTIME_LIB=${inference_install_dir}/third_party/install/onnxruntime/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${ONNXRUNTIME_LIB} + PADDLE2ONNX_LIB=${inference_install_dir}/third_party/install/paddle2onnx/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${PADDLE2ONNX_LIB} + #download model + mkdir -p MobileNetV2 + cd MobileNetV2 + if [[ -e "MobileNetV2.inference.model.tar.gz" ]]; then + echo "MobileNetV2.inference.model.tar.gz has been downloaded." + else + wget -q --no-proxy http://paddle-inference-dist.bj.bcebos.com/MobileNetV2.inference.model.tar.gz + tar xzf *.tar.gz + fi + cd .. +fi + PREFIX=inference-vis-demos%2F URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX} @@ -58,8 +79,7 @@ function download() { fi cd ..
} -mkdir -p $DATA_DIR -cd $DATA_DIR + vis_demo_list='se_resnext50 ocr mobilenet' for vis_demo_name in $vis_demo_list; do download $vis_demo_name @@ -93,7 +113,8 @@ for WITH_STATIC_LIB in ON OFF; do -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do Release/simple_on_word2vec.exe \ @@ -112,7 +133,8 @@ for WITH_STATIC_LIB in ON OFF; do -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do @@ -138,7 +160,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln Release/trt_mobilenet_demo.exe \ --modeldir=$DATA_DIR/mobilenet/model \ @@ -156,7 +179,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model' if [ -d $word2vec_model ]; then @@ -176,7 +200,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do @@ -200,7 +225,8 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) ./trt_mobilenet_demo \ --modeldir=$DATA_DIR/mobilenet/model \ @@ -211,6 +237,26 @@ for WITH_STATIC_LIB in ON OFF; do exit 1 fi fi + + # --------onnxruntime mobilenetv2 on linux/mac------ + if [ $WITH_ONNXRUNTIME == ON ]; then + rm -rf * + cmake .. -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ + -DDEMO_NAME=onnxruntime_mobilenet_demo \ + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME + make -j$(nproc) + ./onnxruntime_mobilenet_demo \ + --modeldir=$DATA_DIR/MobileNetV2/MobileNetV2 + if [ $? -ne 0 ]; then + echo "onnxruntime demo onnxruntime_mobilenet_demo runs fail." + exit 1 + fi + fi fi done set +x diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc new file mode 100644 index 0000000000000000000000000000000000000000..ee82da139d8f39c26002763c4a4835050c48fc99 --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -0,0 +1,354 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/platform/device/gpu/gpu_types.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/inference/utils/io_utils.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { + +framework::proto::VarType::Type ConvertONNXType( + ONNXTensorElementDataType type) { + switch (type) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: + return framework::proto::VarType::FP32; + // case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: + // return DataType::FP16; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: + return framework::proto::VarType::INT8; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: + return framework::proto::VarType::INT32; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: + return framework::proto::VarType::INT64; + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: + return framework::proto::VarType::UINT8; + default: + LOG(ERROR) << "Unsupported ONNX Tensor Type: " << static_cast<int>(type); + return framework::proto::VarType::FP32; + } +} + +bool CheckConvertToONNX(const AnalysisConfig &config) { + if (!config.model_dir().empty()) { + LOG(ERROR) << "Paddle2ONNX does not support the model_dir config"; + // TODO(heliqi jiangjiajun): Paddle2ONNX does not support + // config.model_dir() + "/__model__" + // config.model_dir() + var_name + return false; + } else if (config.prog_file().empty() || config.params_file().empty()) { + LOG(ERROR) << string::Sprintf( + "Invalid model dir '%s', program path '%s' or params path '%s'.", + config.model_dir(), config.prog_file(), config.params_file()); + return false; + } + return paddle2onnx::IsExportable(config.prog_file(), config.params_file(), + config.model_from_memory()); +} + +bool ONNXRuntimePredictor::Init() { + VLOG(3) << "ONNXRuntime Predictor::init()"; + + // Now ONNXRuntime only supports CPU + if (config_.use_gpu()) { + place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); + } else { + place_ = paddle::platform::CPUPlace(); + } + scope_.reset(new paddle::framework::Scope()); + sub_scope_ = &scope_->NewScope(); + + std::string onnx_proto; + paddle2onnx::Export(config_.prog_file(),
config_.params_file(), &onnx_proto, + config_.model_from_memory()); + + Ort::SessionOptions session_options; + if (config_.ort_optimization_enabled()) { + session_options.SetGraphOptimizationLevel( + GraphOptimizationLevel::ORT_ENABLE_ALL); + } + // Turn optimization off first, and then turn it on when it's stable + // session_options.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + // session_options.EnableCpuMemArena(); + // session_options.EnableMemPattern(); + // session_options.SetInterOpNumThreads(config_.cpu_math_library_num_threads()); + session_options.SetIntraOpNumThreads(config_.cpu_math_library_num_threads()); + VLOG(2) << "ONNXRuntime threads " << config_.cpu_math_library_num_threads(); + if (config_.profile_enabled()) { + LOG(WARNING) << "ONNXRuntime Profiler is activated, which might affect the " + "performance"; +#if defined(_WIN32) + session_options.EnableProfiling(L"ONNX"); +#else + session_options.EnableProfiling("ONNX"); +#endif + } else { + VLOG(2) << "ONNXRuntime Profiler is deactivated, and no profiling report " + "will be " + "generated."; + } + session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options}; + + auto memory_info = + Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); + Ort::Allocator allocator(session_, memory_info); + + framework::proto::VarType::Type proto_type = + framework::proto::VarType::LOD_TENSOR; + size_t n_inputs = session_.GetInputCount(); + for (size_t i = 0; i < n_inputs; ++i) { + auto input_name = session_.GetInputName(i, allocator); + auto type_info = session_.GetInputTypeInfo(i); + std::vector<int64_t> shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + input_desc_.emplace_back(ONNXDesc{input_name, shape, data_type}); + auto *ptr = scope_->Var(input_name); + framework::InitializeVariable(ptr, proto_type); + allocator.Free(input_name); + } + + size_t n_outputs = session_.GetOutputCount(); + for (size_t i = 0; i < n_outputs; ++i) { + auto output_name = session_.GetOutputName(i, allocator); + auto type_info = session_.GetOutputTypeInfo(i); + std::vector<int64_t> shape = + type_info.GetTensorTypeAndShapeInfo().GetShape(); + ONNXTensorElementDataType data_type = + type_info.GetTensorTypeAndShapeInfo().GetElementType(); + output_desc_.emplace_back(ONNXDesc{output_name, shape, data_type}); + auto *ptr = scope_->Var(output_name); + framework::InitializeVariable(ptr, proto_type); + allocator.Free(output_name); + } + + return true; +} + +template <> +std::unique_ptr<PaddlePredictor> +CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>( + const AnalysisConfig &config) { + if (config.glog_info_disabled()) { + FLAGS_logtostderr = 1; + FLAGS_minloglevel = 2; // GLOG_ERROR + } + + PADDLE_ENFORCE_EQ( + config.is_valid(), true, + platform::errors::InvalidArgument( + "Note: Each config can only be used for one predictor.")); + + VLOG(3) << "create ONNXRuntimePredictor"; + + std::unique_ptr<PaddlePredictor> predictor(new ONNXRuntimePredictor(config)); + // Each config can only be used for one predictor.
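+ // SetInValid() below marks this config as consumed, so accidentally reusing it for a second predictor trips the is_valid() check above.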
+ config.SetInValid(); + auto predictor_p = dynamic_cast<ONNXRuntimePredictor *>(predictor.get()); + + if (!predictor_p->Init()) { + return nullptr; + } + + return predictor; +} + +std::vector<std::string> ONNXRuntimePredictor::GetInputNames() { + std::vector<std::string> input_names; + for (auto input_desc : input_desc_) { + input_names.push_back(input_desc.name); + } + return input_names; +} + +std::map<std::string, std::vector<int64_t>> +ONNXRuntimePredictor::GetInputTensorShape() { + std::map<std::string, std::vector<int64_t>> input_shapes; + for (auto input_desc : input_desc_) { + input_shapes[input_desc.name] = input_desc.shape; + } + return input_shapes; +} + +std::vector<std::string> ONNXRuntimePredictor::GetOutputNames() { + std::vector<std::string> output_names; + for (auto output_desc : output_desc_) { + output_names.push_back(output_desc.name); + } + return output_names; +} + +std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetInputTensor( + const std::string &name) { + PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), + platform::errors::PreconditionNotMet( + "The input variable named %s is not found in the " + "scope of the ONNXPredictor.", + name)); + std::unique_ptr<ZeroCopyTensor> res( + new ZeroCopyTensor(static_cast<void *>(scope_.get()))); + res->input_or_output_ = true; + res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = place_; + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + return res; +} + +std::unique_ptr<ZeroCopyTensor> ONNXRuntimePredictor::GetOutputTensor( + const std::string &name) { + PADDLE_ENFORCE_NOT_NULL(scope_->FindVar(name), + platform::errors::PreconditionNotMet( + "The output variable named %s is not found in the " + "scope of the ONNXPredictor.", + name)); + std::unique_ptr<ZeroCopyTensor> res( + new ZeroCopyTensor(static_cast<void *>(scope_.get()))); + res->input_or_output_ = false; + res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = place_; + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + return res; +} + +Ort::Value ONNXRuntimePredictor::GetOrtValue(const ONNXDesc &desc, + const char *device_name) { + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + auto *var = scope_->FindVar(desc.name); + auto *tensor = var->GetMutable<framework::LoDTensor>(); + size_t size = + tensor->numel() * + framework::SizeOfType(framework::TransToProtoVarType(tensor->dtype())); + std::vector<int64_t> shape = phi::vectorize(tensor->dims()); + return Ort::Value::CreateTensor(memory_info, + static_cast<void *>(tensor->data()), size, + shape.data(), shape.size(), desc.dtype); +} + +void ONNXRuntimePredictor::AsTensor(const Ort::Value &value, + const ONNXDesc &desc) { + auto info = value.GetTensorTypeAndShapeInfo(); + + auto *var = scope_->FindVar(desc.name); + auto *tensor = var->GetMutable<framework::LoDTensor>(); + tensor->Resize(phi::make_ddim(info.GetShape())); + auto dtype = ConvertONNXType(info.GetElementType()); + auto *ptr = tensor->mutable_data(place_, dtype); + + if (platform::is_cpu_place(place_)) { + std::memcpy(ptr, const_cast<void *>(value.GetTensorData<void>()), + tensor->numel() * framework::SizeOfType(dtype)); + } else { + auto src_place = place_; + auto dst_place = place_; + memory::Copy(dst_place, ptr, src_place, + const_cast<void *>(value.GetTensorData<void>()), + tensor->numel() * framework::SizeOfType(dtype)); + } +} + +bool ONNXRuntimePredictor::Run(const std::vector<PaddleTensor> &inputs, + std::vector<PaddleTensor> *output_data, + int batch_size) { + LOG(ERROR) << "Run() is not supported, please use ZeroCopyRun()"; + return false; +} + +bool ONNXRuntimePredictor::ZeroCopyRun() { + try { + Ort::IoBinding binding(session_); + std::vector<Ort::Value> inputs; + std::vector<Ort::Value> outputs; + 
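+    // Input Ort::Values below are zero-copy views over the Paddle tensor buffers; outputs are left for ORT to allocate and are copied back into Paddle tensors via AsTensor() after the session runs.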
Ort::RunOptions options; + + inputs.reserve(input_desc_.size()); + const char *device_name = config_.use_gpu() ? "Cuda" : "Cpu"; + for (auto desc : input_desc_) { + inputs.push_back(GetOrtValue(desc, device_name)); + binding.BindInput(desc.name.c_str(), inputs.back()); + } + + // TODO(heliqi): Optimization - move this to Init() + for (auto desc : output_desc_) { + Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, + place_.GetDeviceId(), OrtMemTypeDefault); + binding.BindOutput(desc.name.c_str(), memory_info); + } + + session_.Run({}, binding); + + outputs = binding.GetOutputValues(); + for (size_t i = 0; i < output_desc_.size(); ++i) { + AsTensor(outputs[i], output_desc_[i]); + } + } catch (const std::exception &e) { + LOG(ERROR) << e.what(); + return false; + } + + return true; +} + +std::unique_ptr<PaddlePredictor> ONNXRuntimePredictor::Clone() { + LOG(ERROR) << "Clone() is not supported, please create a new Predictor"; + return nullptr; +} + +uint64_t ONNXRuntimePredictor::TryShrinkMemory() { + return paddle::memory::Release(place_); +} + +ONNXRuntimePredictor::~ONNXRuntimePredictor() { + if (sub_scope_) { + scope_->DeleteScope(sub_scope_); + } + memory::Release(place_); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h new file mode 100644 index 0000000000000000000000000000000000000000..7fb07aa97bd2746773192456ddeba941a24e8906 --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -0,0 +1,225 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
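Reviewer note: taken together, the pieces above form the complete user-facing path for the new backend. Below is a minimal end-to-end sketch through the public paddle_infer API; the model paths, input shape, and the 1000-element output are illustrative placeholders mirroring the MobileNetV2 demo, and this snippet is not part of the diff itself:

#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include <vector>

int main() {
  paddle_infer::Config config;
  config.SetModel("./MobileNetV2/inference.pdmodel",
                  "./MobileNetV2/inference.pdiparams");
  config.EnableONNXRuntime();      // logs a warning and falls back to Paddle Inference if conversion fails
  config.EnableORTOptimization();  // enables GraphOptimizationLevel::ORT_ENABLE_ALL
  auto predictor = paddle_infer::CreatePredictor(config);

  std::vector<float> input(1 * 3 * 224 * 224, 1.0f);
  auto input_t = predictor->GetInputHandle(predictor->GetInputNames()[0]);
  input_t->Reshape({1, 3, 224, 224});
  input_t->CopyFromCpu(input.data());

  predictor->Run();  // internally dispatches to ONNXRuntimePredictor::ZeroCopyRun()

  std::vector<float> output(1000);  // class scores for a MobileNetV2-style model
  auto output_t = predictor->GetOutputHandle(predictor->GetOutputNames()[0]);
  output_t->CopyToCpu(output.data());
  return 0;
}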
+ +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/op_compatible_info.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/device/gpu/gpu_types.h" +#include "paddle/fluid/string/printf.h" + +#include "onnxruntime_c_api.h" // NOLINT +#include "onnxruntime_cxx_api.h" // NOLINT +#include "paddle2onnx/converter.h" + +#ifdef PADDLE_WITH_TESTING +#include +#include +#endif + +/// +/// \file onnxruntime_predictor.h +/// +/// \brief A predictor using ONNXRuntime +/// +/// \author heliqi@baidu.com +/// \date 2022-02-14 +/// \since 2.3.0 +/// + +namespace paddle { + +bool CheckConvertToONNX(const AnalysisConfig &config); + +struct ONNXDesc { + std::string name; + std::vector<int64_t> shape; + ONNXTensorElementDataType dtype; +}; + +/// +/// \class ONNXRuntimePredictor +/// +/// \brief The ONNXRuntimePredictor using ONNXRuntime for inference +/// +/// The predictor has the following typical uses: +/// +/// Get predictor +/// \code{cpp} +/// auto predictor = CreatePaddlePredictor(config); +/// \endcode +/// +/// Get input or output names +/// \code{cpp} +/// auto input_names = predictor->GetInputNames(); +/// auto output_names = predictor->GetOutputNames(); +/// \endcode +/// +/// Get input or output tensors +/// \code{cpp} +/// auto input_t = predictor->GetInputTensor(input_names[0]); +/// auto output_t = predictor->GetOutputTensor(output_names[0]); +/// \endcode +/// +/// Run predictor +/// \code{cpp} +/// predictor->ZeroCopyRun(); +/// \endcode +/// +class ONNXRuntimePredictor : public PaddlePredictor { + public: + /// + /// \brief Construct a new ONNXRuntime Predictor object + /// + /// \param[in] AnalysisConfig config + /// + explicit ONNXRuntimePredictor(const AnalysisConfig &config) + : config_(config) { + predictor_id_ = inference::GetUniqueId(); + env_ = Ort::Env(ORT_LOGGING_LEVEL_INFO, "onnx"); + } + /// + /// \brief Destroy the ONNXRuntime Predictor object + /// + ~ONNXRuntimePredictor(); + + /// + /// \brief Initialize predictor + /// + /// \return Whether the init function executed successfully + /// + bool Init(); + + /// + /// \brief Get the input names + /// + /// \return input names + /// + std::vector<std::string> GetInputNames(); + + /// + /// \brief Get the output names + /// + /// \return output names + /// + std::vector<std::string> GetOutputNames(); + + /// + /// \brief Get the Input Tensor object + /// + /// \param[in] name input name + /// \return input tensor + /// + std::unique_ptr<ZeroCopyTensor> GetInputTensor( + const std::string &name) override; + + /// + /// \brief Get the Output Tensor object + /// + /// \param[in] name output name + /// \return output tensor + /// + std::unique_ptr<ZeroCopyTensor> GetOutputTensor( + const std::string &name) override; + /// + /// \brief Get all input names and their corresponding shapes + /// + /// \return the map of input names and shapes + /// + std::map<std::string, std::vector<int64_t>> GetInputTensorShape() override; + + /// Not supported. + bool Run(const std::vector<PaddleTensor> &inputs, + std::vector<PaddleTensor> *output_data, + int batch_size = -1) override; + + /// + /// \brief Run the prediction engine + /// + /// \return Whether the function executed successfully + /// + bool ZeroCopyRun() override; + + /// + /// \brief Release all temporary tensors to compress the size of the memory pool.
+ /// The memory pool is considered to be composed of a list of chunks, if + /// the chunk is not occupied, it can be released. + /// + /// \return Number of bytes released. It may be smaller than the actual + /// released memory, because part of the memory is not managed by the + /// MemoryPool. + /// + uint64_t TryShrinkMemory() override; + /// + /// \brief Clone to get a new predictor. Thread safe. + /// + /// \return a new predictor + /// + std::unique_ptr<PaddlePredictor> Clone() override; + + std::shared_ptr<framework::Scope> scope_; + + private: + /// + /// \brief Get the Ort::Value (input tensor). + /// + /// \param[in] desc ONNXDesc (name, shape, dtype) + /// + /// \param[in] device_name device name, "Cpu" or "Cuda" + /// + /// \return the created Ort::Value + /// + Ort::Value GetOrtValue(const ONNXDesc &desc, const char *device_name); + + /// + /// \brief Ort::Value to Paddle::ZeroCopyTensor. + /// + /// \param[in] value Ort::Value (output tensor) + /// + /// \param[in] desc an ONNXDesc (name, shape, dtype) + /// + void AsTensor(const Ort::Value &value, const ONNXDesc &desc); + + private: + AnalysisConfig config_; + + // ONNXRuntime + Ort::Env env_; + Ort::Session session_{nullptr}; + + platform::Place place_; + framework::Scope *sub_scope_{nullptr}; + std::vector<ONNXDesc> input_desc_; + std::vector<ONNXDesc> output_desc_; + int predictor_id_; + +// Some more detailed tests are made friends of the predictor, so that all +// the details can be tested. +#if PADDLE_WITH_TESTING + FRIEND_TEST(ONNXRuntimePredictor, onnxruntime_on); +#endif +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..2be2de9c60bb1c3fdedf13212d50a6f4e155d4df --- /dev/null +++ b/paddle/fluid/inference/api/onnxruntime_predictor_tester.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
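The GetOrtValue/AsTensor pair documented above leans on ONNXRuntime's external-buffer tensors: input bindings wrap Paddle-owned memory instead of copying it. A standalone sketch of that ORT C++ API pattern follows; the buffer contents and shape are illustrative, and this snippet is not part of the diff:

#include <onnxruntime_cxx_api.h>
#include <vector>

int main() {
  std::vector<float> buf(6, 0.f);  // caller-owned memory; must outlive the Ort::Value
  std::vector<int64_t> shape{2, 3};
  auto mem = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
  // CreateTensor builds a view over buf: ORT reads the data in place, no copy.
  Ort::Value value = Ort::Value::CreateTensor<float>(
      mem, buf.data(), buf.size(), shape.data(), shape.size());
  return 0;
}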
+ +#include "paddle/fluid/inference/api/onnxruntime_predictor.h" + +#include +#include +#include +#include // NOLINT +#include +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_api.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/inference/utils/io_utils.h" +#include "paddle/fluid/platform/cpu_info.h" + +DEFINE_string(dirname, "", "dirname to tests."); + +namespace paddle { + +TEST(ONNXRuntimePredictor, onnxruntime_on) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname + "/inference.pdmodel", + FLAGS_dirname + "/inference.pdiparams"); + config.EnableONNXRuntime(); + config.EnableORTOptimization(); + config.SetCpuMathLibraryNumThreads(2); + LOG(INFO) << config.Summary(); + + auto _predictor = + CreatePaddlePredictor(config); + ASSERT_TRUE(_predictor); + auto* predictor = static_cast(_predictor.get()); + + ASSERT_TRUE(predictor); + ASSERT_TRUE(!predictor->Clone()); + ASSERT_TRUE(predictor->scope_); + ASSERT_TRUE(predictor->sub_scope_); + ASSERT_EQ(predictor->scope_->parent(), nullptr); + ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get()); + // Dummy Input Data + std::vector input_shape = {-1, 3, 224, 224}; + std::vector input_data(1 * 3 * 224 * 224, 1.0); + std::vector out_data; + out_data.resize(1000); + + // testing all interfaces + auto input_names = predictor->GetInputNames(); + auto output_names = predictor->GetOutputNames(); + auto get_input_shape = predictor->GetInputTensorShape(); + + ASSERT_EQ(input_names.size(), 1UL); + ASSERT_EQ(output_names.size(), 1UL); + ASSERT_EQ(input_names[0], "inputs"); + ASSERT_EQ(output_names[0], "save_infer_model/scale_0.tmp_1"); + ASSERT_EQ(get_input_shape["inputs"], input_shape); + + auto input_tensor = predictor->GetInputTensor(input_names[0]); + input_tensor->Reshape({1, 3, 224, 224}); + auto output_tensor = predictor->GetOutputTensor(output_names[0]); + + input_tensor->CopyFromCpu(input_data.data()); + ASSERT_TRUE(predictor->ZeroCopyRun()); + output_tensor->CopyToCpu(out_data.data()); + + predictor->TryShrinkMemory(); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index b4a358394404fa7d28838a00c96290747f146a1f..7b765e3fa8a24ef1b81b68da8ba12dd8e5577572 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -319,6 +319,18 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableNpu(int device_id = 0); /// + /// \brief Turn on ONNXRuntime. + /// + void EnableONNXRuntime(); + /// + /// \brief Turn off ONNXRuntime. + /// + void DisableONNXRuntime(); + /// + /// \brief Turn on ONNXRuntime Optimization. + /// + void EnableORTOptimization(); + /// /// \brief A boolean state telling whether the GPU is turned on. /// /// \return bool Whether the GPU is turned on. @@ -342,6 +354,19 @@ struct PD_INFER_DECL AnalysisConfig { /// bool use_ipu() const { return use_ipu_; } /// + /// \brief A boolean state telling whether the ONNXRuntime is turned on. + /// + /// \return bool Whether the ONNXRuntime is turned on. + /// + bool use_onnxruntime() const { return use_onnxruntime_; } + /// + /// \brief A boolean state telling whether the ONNXRuntime Optimization is + /// turned on. 
+ /// + /// \return bool Whether the ONNXRuntime Optimization is turned on. + /// + bool ort_optimization_enabled() const { return enable_ort_optimization_; } + /// /// \brief Get the GPU device id. /// /// \return int The GPU device id. @@ -841,6 +866,10 @@ struct PD_INFER_DECL AnalysisConfig { bool use_npu_{false}; int npu_device_id_{0}; + // ONNXRuntime related + bool use_onnxruntime_{false}; + bool enable_ort_optimization_{false}; + // Padding related bool use_fc_padding_{true}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index c129efe494b4fb36bc72d3c93e24951ba7fef322..657dd9b600cce7173e3aa8d0156ba0975199cf98 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -192,6 +192,7 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor { private: friend class AnalysisPredictor; + friend class ONNXRuntimePredictor; explicit ZeroCopyTensor(void* scope) : paddle_infer::Tensor{scope} {} }; @@ -381,6 +382,7 @@ enum class PaddleEngineKind { kNative = 0, ///< Use the native Fluid facility. kAutoMixedTensorRT, ///< Automatically mix Fluid with TensorRT. kAnalysis, ///< More optimization. + kONNXRuntime, ///< Use ONNXRuntime }; template @@ -395,6 +397,11 @@ template <> PD_INFER_DECL std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config); +template <> +PD_INFER_DECL std::unique_ptr<PaddlePredictor> +CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kONNXRuntime>( + const AnalysisConfig& config); + PD_INFER_DECL int PaddleDtypeSize(PaddleDType dtype); PD_INFER_DECL std::string get_version(); diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc index e342190fda1aca53a6814806e1afec1335224d79..d7b07652babbd1e24e2c650ac8ac079f03523d12 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.cc +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -126,6 +126,26 @@ PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) { return config->use_gpu(); } + +void PD_ConfigEnableONNXRuntime(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableONNXRuntime(); +} + +void PD_ConfigDisableONNXRuntime(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableONNXRuntime(); +} + +PD_Bool PD_ConfigONNXRuntimeEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_onnxruntime(); +} + +void PD_ConfigEnableORTOptimization(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableORTOptimization(); +} + void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, int32_t l3_workspace_size, PD_Bool locked, PD_Bool autotune, const char* autotune_file, diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index c314aca918f141d30661d9034656899bbb816063..f6b754cad213f8d5249317468b5ceb21e863f6ad 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -152,6 +152,34 @@ PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu( __pd_keep PD_Config* pd_config); PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu( __pd_keep PD_Config* pd_config); /// +/// \brief Turn on ONNXRuntime. +/// +/// \param[in] pd_config config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableONNXRuntime( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn off ONNXRuntime.
+/// +/// \param[in] pd_config config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableONNXRuntime( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the ONNXRuntime is turned on. +/// +/// \return Whether the ONNXRuntime is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigONNXRuntimeEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on ONNXRuntime Optimization. +/// +/// \param[in] pd_config config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableORTOptimization( + __pd_keep PD_Config* pd_config); +/// /// \brief Turn on XPU. /// /// \param[in] pd_config config diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go index def26913b0a1c082b3a983cea5fa8021c468b59c..8f9f34c06b4768317d6f710ac49a7610a9ef9d6a 100644 --- a/paddle/fluid/inference/goapi/config.go +++ b/paddle/fluid/inference/goapi/config.go @@ -160,6 +160,36 @@ func (config *Config) EnableUseGpu(memorySize uint64, deviceId int32) { C.PD_ConfigEnableUseGpu(config.c, C.uint64_t(memorySize), C.int32_t(deviceId)) } +/// +/// \brief Turn on ONNXRuntime. +/// +func (config *Config) EnableONNXRuntime() { + C.PD_ConfigEnableONNXRuntime(config.c) +} + +/// +/// \brief Turn off ONNXRuntime. +/// +func (config *Config) DisableONNXRuntime() { + C.PD_ConfigDisableONNXRuntime(config.c) +} + +/// +/// \brief A boolean state telling whether the ONNXRuntime is turned on. +/// +/// \return bool Whether the ONNXRuntime is turned on. +/// +func (config *Config) ONNXRuntimeEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigONNXRuntimeEnabled(config.c)) +} + +/// +/// \brief Turn on ONNXRuntime Optimization. +/// +func (config *Config) EnableORTOptimization() { + C.PD_ConfigEnableORTOptimization(config.c) +} + /// /// \brief Turn on XPU.
/// diff --git a/paddle/fluid/inference/goapi/config_test.go b/paddle/fluid/inference/goapi/config_test.go index b82161880839e500a20b787914e2827da151106b..297841dcbcf6c19aef4a536557ec30e76ea9f82c 100644 --- a/paddle/fluid/inference/goapi/config_test.go +++ b/paddle/fluid/inference/goapi/config_test.go @@ -122,3 +122,20 @@ func TestMkldnn(t *testing.T) { config.SetBfloat16Op([]string{"fc", "mul"}) } + +func TestONNXRuntime(t *testing.T) { + config := NewConfig() + config.SetModelDir("modelDir") + t.Log(config.ModelDir()) + + config.EnableONNXRuntime() + t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled()) + + config.DisableONNXRuntime() + t.Logf("ONNXRuntimeEnabled:%+v", config.ONNXRuntimeEnabled()) + + config.EnableORTOptimization() + + config.SetCpuMathLibraryNumThreads(4) + t.Logf("CpuMathLibraryNumThreads:%+v", config.CpuMathLibraryNumThreads()) +} \ No newline at end of file diff --git a/paddle/fluid/inference/goapi/predictor_test.go b/paddle/fluid/inference/goapi/predictor_test.go index 40e518304510c57fec9cd7609ecbd6eefa456050..755558f96238d11842f8245c2b36210c60d8a057 100644 --- a/paddle/fluid/inference/goapi/predictor_test.go +++ b/paddle/fluid/inference/goapi/predictor_test.go @@ -66,6 +66,42 @@ func TestNewPredictor(t *testing.T) { cloned.ClearIntermediateTensor() } +func TestONNXRuntimePredictor(t *testing.T) { + t.Logf("Version:\n%+v", Version()) + config := NewConfig() + config.SetModel("./mobilenetv1/inference.pdmodel", "./mobilenetv1/inference.pdiparams") + config.EnableONNXRuntime() + config.EnableORTOptimization() + predictor := NewPredictor(config) + inNames := predictor.GetInputNames() + t.Logf("InputNames:%+v", inNames) + outNames := predictor.GetOutputNames() + t.Logf("OutputNames:%+v", outNames) + + inHandle := predictor.GetInputHandle(inNames[0]) + inHandle.Reshape([]int32{1, 3, 224, 224}) + t.Logf("inHandle name:%+v, shape:%+v", inHandle.Name(), inHandle.Shape()) + + data := make([]float32, numElements([]int32{1, 3, 224, 224})) + for i := 0; i < int(numElements([]int32{1, 3, 224, 224})); i++ { + data[i] = float32(i%255) * 0.1 + } + inHandle.CopyFromCpu(data) + t.Logf("inHandle Type:%+v", inHandle.Type()) + + predictor.Run() + + outHandle := predictor.GetOutputHandle(outNames[0]) + t.Logf("outHandle name:%+v", outHandle.Name()) + + outShape := outHandle.Shape() + t.Logf("outHandle Shape:%+v", outShape) + outData := make([]float32, numElements(outShape)) + outHandle.CopyToCpu(outData) + t.Log(outData) +} + + func TestFromBuffer(t *testing.T) { modelFile, err := os.Open("./mobilenetv1/inference.pdmodel") if err != nil { diff --git a/paddle/fluid/inference/goapi/test.sh b/paddle/fluid/inference/goapi/test.sh index edccc2648c012fda9e22c2fc14ffe4f90dc26cfe..cff9fd4aa7ceada2a37d9650c9ce3653f0155447 100644 --- a/paddle/fluid/inference/goapi/test.sh +++ b/paddle/fluid/inference/goapi/test.sh @@ -22,6 +22,7 @@ fi # 2. set LD_LIBRARY_PATH export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/mklml/lib/:$PWD/paddle_inference_c/third_party/install/mkldnn/lib/:$PWD/paddle_inference_c/paddle/lib/ +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/paddle_inference_c/third_party/install/onnxruntime/lib/:$PWD/paddle_inference_c/third_party/install/paddle2onnx/lib/ # 3. 
go test go clean -testcache diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index 8c61200f7f57cdf57b372c37c8f7cea40c4a8d4c..b69292827aa136fd1d8a1f66d80823e6344a6174 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -89,5 +89,5 @@ class DropoutOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index f2dc5ba1c7c2c832e0239f6a30760c354aaf4699..7f7313fbcb5969aafea47ad23248acd5a6ca3644 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -52,7 +52,7 @@ TEST(Relu6OpConverter, main) { test_activation("relu6"); } } // namespace inference } // namespace paddle -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP(sigmoid); USE_OP(tanh); USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc index 474fd92071fb0795b868f0cd86591061cf8b6581..cf377396087637f115523ddc60a468e2a23d57d4 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc @@ -57,4 +57,4 @@ TEST(DropoutOpConverter, main) { } // namespace inference } // namespace paddle -USE_OP(dropout); +USE_OP_ITSELF(dropout); diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc index df0eb58c2bd587e69215602512cc51f19c97a978..a341ffd7a081c24500e3b061b0ce3510a2aaacbc 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -81,6 +81,18 @@ TEST(PD_Config, interface) { PD_ConfigSetBfloat16Op(config, 1, &ops_name); #endif + PD_ConfigEnableONNXRuntime(config); + bool onnxruntime_enabled = PD_ConfigONNXRuntimeEnabled(config); +#ifdef PADDLE_WITH_ONNXRUNTIME + EXPECT_TRUE(onnxruntime_enabled); +#else + EXPECT_FALSE(onnxruntime_enabled); +#endif + PD_ConfigDisableONNXRuntime(config); + bool onnxruntime_disabled = PD_ConfigONNXRuntimeEnabled(config); + EXPECT_FALSE(onnxruntime_disabled); + PD_ConfigEnableORTOptimization(config); + PD_ConfigEnableMemoryOptim(config, true); bool memory_enabled = PD_ConfigMemoryOptimEnabled(config); EXPECT_TRUE(memory_enabled); diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index 9d83f8ff8fdc4756450c0fe9ae4d7096d9afa76f..f376cbd4fb302b1d7a038d958465f24db653e220 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -5,6 +5,7 @@ option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." OFF) option(USE_TENSORRT "Compile demo with TensorRT." 
OFF) option(WITH_GTEST "Compile demo with GTEST" OFF) +option(WITH_ONNXRUNTIME "Compile demo with ONNXRuntime" OFF) if(NOT WITH_STATIC_LIB) add_definitions("-DPADDLE_WITH_SHARED_LIB") @@ -45,6 +46,13 @@ link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (WITH_ONNXRUNTIME) + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/include") + include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/include") + + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib") + link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib") +endif() if (WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") @@ -172,6 +180,16 @@ else() endif() endif() +if (WITH_ONNXRUNTIME) + if(WIN32) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.lib paddle2onnx) + elseif(APPLE) + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.1.10.0.dylib paddle2onnx) + else() + set(DEPS ${DEPS} ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/libonnxruntime.so.1.10.0 paddle2onnx) + endif() +endif() + if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} @@ -248,6 +266,14 @@ if(WIN32) COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release ) endif() + if(WITH_ONNXRUNTIME) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}onnxruntime/lib/onnxruntime.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE_LIB_THIRD_PARTY_PATH}paddle2onnx/lib/paddle2onnx.dll + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + endif() if(NOT WITH_STATIC_LIB) add_custom_command(TARGET ${DEMO_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_inference.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} diff --git a/paddle/fluid/inference/tests/infer_ut/run.sh b/paddle/fluid/inference/tests/infer_ut/run.sh index dd4b64f28d739776ee750205d41b4dce35a97640..8123d3785003471fd5f63f24fbb1166913d7e571 100755 --- a/paddle/fluid/inference/tests/infer_ut/run.sh +++ b/paddle/fluid/inference/tests/infer_ut/run.sh @@ -20,7 +20,8 @@ TURN_ON_MKL=$2 # use MKL or Openblas TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode DATA_DIR=$4 # dataset TENSORRT_ROOT_DIR=$5 # TensorRT ROOT dir, default to /usr/local/TensorRT -MSVC_STATIC_CRT=$6 +WITH_ONNXRUNTIME=$6 +MSVC_STATIC_CRT=$7 inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir EXIT_CODE=0 # init default exit code WIN_DETECT=$(echo `uname` | grep "Win") # detect current platform @@ -144,7 +145,8 @@ function compile_test() { -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT \ -DWITH_GTEST=ON \ -DCMAKE_CXX_FLAGS='/std:c++17' \ - -DCMAKE_BUILD_TYPE=Release + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME msbuild /maxcpucount /property:Configuration=Release ALL_BUILD.vcxproj else cmake .. 
-DPADDLE_LIB=${inference_install_dir} \ @@ -154,7 +156,8 @@ function compile_test() { -DWITH_STATIC_LIB=OFF \ -DUSE_TENSORRT=$USE_TENSORRT \ -DTENSORRT_ROOT=$TENSORRT_ROOT_DIR \ - -DWITH_GTEST=ON + -DWITH_GTEST=ON \ + -DWITH_ONNXRUNTIME=$WITH_ONNXRUNTIME make -j$(nproc) fi; cd - diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 05c468b798886ac135ed30bff75ce9400f1ca3a1..6b6c0cd22f03b902f08d7a79236b1091b9fe6677 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -80,6 +80,14 @@ if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inferenc endif() set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model") +if(WITH_ONNXRUNTIME) + set(MOBILENETV2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/MobileNetV2") + if(NOT EXISTS ${MOBILENETV2_INSTALL_DIR}/MobileNetV2.inference.model.tar.gz) + inference_download_and_uncompress_without_verify(${MOBILENETV2_INSTALL_DIR} ${INFERENCE_URL} "MobileNetV2.inference.model.tar.gz") + endif() + set(MOBILENETV2_MODEL_DIR "${MOBILENETV2_INSTALL_DIR}/MobileNetV2") +endif() + function (inference_base_test_build TARGET) set(options "") set(oneValueArgs "") diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 91a0352e1915e95378012aa398ff996cbc10f216..e77be832c0cc8975c3fc2ebb7fad577cdfe919f5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -161,7 +161,7 @@ cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEP set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows_utils lapack_function lod_tensor maxouting unpooling pooling lod_rank_table context_project -sequence_pooling segment_pooling executor device_memory_aligment generator) +sequence_pooling executor device_memory_aligment generator) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse matrix_solve) diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc index 0ac29e6d3ada7335cab510ef82c9f46d2da7eb05..b4a97e24cf29233776b19aa0ea7764a00435f6fc 100644 --- a/paddle/fluid/operators/activation_cudnn_op.cu.cc +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -132,7 +132,9 @@ struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -146,7 +148,9 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { : CudnnActivationGradFunctor(ctx, 6.0, GPUDNN_ACTIVATION_CLIPPED_RELU) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -159,7 +163,9 @@ struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, 
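// Note: the 0.0 passed above is cuDNN's activation coefficient; cuDNN reads it only for the clipped-relu/elu modes (e.g. the 6.0 ceiling in CudnnRelu6GradFunctor) and ignores it for sigmoid, so only the mode constant that follows matters here.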
GPUDNN_ACTIVATION_SIGMOID) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -172,7 +178,9 @@ struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor { explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) : CudnnActivationGradFunctor(ctx, 0.0, GPUDNN_ACTIVATION_TANH) {} - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -197,7 +205,8 @@ class CudnnActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out."); + static_assert(Functor::FwdDeps() == ActBwdOpFwdDeps::kDepOut, + "Forward deps must be Out."); const framework::Tensor *X, *Out, *dOut; X = Out = dOut = nullptr; diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 73d65b7c6e7e0a5be2d680afba971d54b492c05d..66f1bcc8b68692abe588b6429b027462eaebde24 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -34,7 +34,8 @@ using paddle::framework::Tensor; template static constexpr bool CanInplaceAct() { - return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps; + return GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kDepOut || + GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kNoDeps; } #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ @@ -921,7 +922,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -931,7 +933,8 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DOut")) { ctx->ShareDim("Out", "DOut"); ctx->ShareLoD("Out", "DOut"); @@ -960,13 +963,15 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("X", "DDOut"); ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("DDOut")) { ctx->ShareDim("Out", "DDOut"); ctx->ShareLoD("Out", "DDOut"); @@ -987,7 +992,8 @@ class ActivationOpTripleGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { if (ctx->HasOutput("DX")) { ctx->ShareDim("X", "DX"); ctx->ShareLoD("X", "DX"); @@ -997,7 +1003,8 @@ class ActivationOpTripleGrad : public 
framework::OperatorWithKernel { ctx->ShareLoD("X", "DDOut"); } } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { if (ctx->HasOutput("D_DOut")) { ctx->ShareDim("Out", "D_DOut"); ctx->ShareLoD("Out", "D_DOut"); @@ -1464,6 +1471,18 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +REGISTER_ACTIVATION_OP(cos, Cos, CosFunctor, CosGradFunctor) +REGISTER_ACTIVATION_OP(tan, Tan, TanFunctor, TanGradFunctor); +REGISTER_ACTIVATION_OP(acos, Acos, AcosFunctor, AcosGradFunctor); +REGISTER_ACTIVATION_OP(sin, Sin, SinFunctor, SinGradFunctor); +REGISTER_ACTIVATION_OP(asin, Asin, AsinFunctor, AsinGradFunctor); +REGISTER_ACTIVATION_OP(atan, Atan, AtanFunctor, AtanGradFunctor); +REGISTER_ACTIVATION_OP(sinh, Sinh, SinhFunctor, SinhGradFunctor); +REGISTER_ACTIVATION_OP(cosh, Cosh, CoshFunctor, CoshGradFunctor); +REGISTER_ACTIVATION_OP(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); +REGISTER_ACTIVATION_OP(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); +REGISTER_ACTIVATION_OP(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); + /* ========================== sigmoid register ============================= */ // 1. Register Sigmoid Operator @@ -1584,16 +1603,6 @@ REGISTER_OPERATOR( ops::ActivationOpDoubleGrad2::FwdDeps()>, ops::ActivationDoubleGradOpInplaceInferer); -REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluCPUFunctor, ReluGradFunctor); - -REGISTER_OP_CPU_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); /* ========================================================================== */ /* ======================== leaky relu register ============================ */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index ff41da86f7bb6ba8406d58804888b5dcd8bc3be0..4b79397b6cdf2e5c2993f7a72f512cc924c208e7 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -35,16 +35,14 @@ limitations under the License. */ #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/kernels/funcs/activation_functor.h" + namespace paddle { namespace operators { using framework::To32BitIndex; -enum ActBwdOpFwdDeps { - kNoDeps = 0x00, // Do not need any forward input/output - kDepX = 0x01, // Only need forward input X - kDepOut = 0x02, // Only need forward output Out -}; +using ActBwdOpFwdDeps = phi::funcs::ActBwdOpFwdDeps; /* The following operator can be used to process SelectedRows, because the * output of those operator for zero is zero too. 
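The hunk above swaps the operator-local ActBwdOpFwdDeps enum for an alias to phi::funcs::ActBwdOpFwdDeps, which is why every bare kDepX / kDepOut in the rest of this diff has to become a qualified ActBwdOpFwdDeps::kDepX / ActBwdOpFwdDeps::kDepOut: a scoped enum's enumerators are not visible unqualified. A minimal standalone sketch of the dependency-flag pattern, assuming the phi version is an enum class that keeps the old bit values (the added qualifications suggest it is):

#include <cstdint>

// Flag values mirrored from the enum deleted above; assumed unchanged in
// phi::funcs so the existing bitmask logic keeps working.
enum class ActBwdOpFwdDeps : uint32_t {
  kNoDeps = 0x00,  // backward needs no forward tensor
  kDepX = 0x01,    // backward needs the forward input X
  kDepOut = 0x02,  // backward needs the forward output Out
};

// Scoped enumerators do not convert to integers implicitly, so bitmask
// tests keep the explicit static_casts seen throughout this patch.
inline bool NeedsForwardOut(ActBwdOpFwdDeps deps) {
  return (static_cast<uint32_t>(deps) &
          static_cast<uint32_t>(ActBwdOpFwdDeps::kDepOut)) != 0;
}

In-place activation stays legal exactly when the backward pass depends on Out alone (or on nothing), which is the condition CanInplaceAct() in activation_op.cc now spells out with the qualified enumerators.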
@@ -89,7 +87,8 @@ inline void ExtractActivationGradTensor( auto x_grad_var = context.OutputVar(framework::GradVarName("X")); const framework::Variable* out_var = nullptr; - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { out_var = context.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, platform::errors::NotFound( @@ -139,7 +138,7 @@ inline void ExtractActivationGradTensor( "Output(Out), variable name = %s", context.OutputName(framework::GradVarName("X")))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = context.InputVar("X"); PADDLE_ENFORCE_NOT_NULL(x_var, platform::errors::NotFound( "Cannot get the tensor from the " @@ -248,6 +247,24 @@ struct SigmoidFunctor : public BaseActivationFunctor { } }; +#define USE_PHI_FUNCTOR(name) \ + template \ + using name##Functor = phi::funcs::name##Functor; \ + template \ + using name##GradFunctor = phi::funcs::name##GradFunctor; + +USE_PHI_FUNCTOR(Cos) +USE_PHI_FUNCTOR(Tan) +USE_PHI_FUNCTOR(Acos) +USE_PHI_FUNCTOR(Sin) +USE_PHI_FUNCTOR(Asin) +USE_PHI_FUNCTOR(Atan) +USE_PHI_FUNCTOR(Sinh) +USE_PHI_FUNCTOR(Cosh) +USE_PHI_FUNCTOR(Asinh) +USE_PHI_FUNCTOR(Acosh) +USE_PHI_FUNCTOR(Atanh) + template struct SigmoidGradFunctor : public BaseActivationFunctor { template { dx.device(d) = dout * out * (static_cast(1) - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -293,7 +312,9 @@ struct SigmoidGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out) * out * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* @@ -351,7 +372,9 @@ struct SigmoidTripleGradFunctor : public BaseActivationFunctor { (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // silu(x) = x / (1 + exp(-x)) @@ -376,7 +399,7 @@ struct SiluGradFunctor : public BaseActivationFunctor { (static_cast(1) + (temp2 / temp1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // Originally: logsigmoid(x) = -log (1 + exp(-x)) @@ -414,7 +437,7 @@ struct LogSigmoidGradFunctor : public BaseActivationFunctor { dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // exp(x) = e^x @@ -434,7 +457,9 @@ struct ExpGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // expm1(x) = e^x - 1 @@ -454,38 +479,23 @@ struct Expm1GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // relu(x) = max(x, 0) -template -struct ReluCPUFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - 
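// Note: element-wise max(x, 0); this fluid CPU functor is deleted because relu now aliases the phi::funcs implementation (see the using-declarations added just below).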
out.device(d) = x.unaryExpr([] HOSTDEVICE(T v) { - return v > static_cast(0) ? v : static_cast(0); - }); - } -}; template -struct ReluCUDAFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.cwiseMax(static_cast(0)); - } -}; +using ReluCPUFunctor = phi::funcs::ReluCPUFunctor; +template +using ReluGradFunctor = phi::funcs::ReluGradFunctor; template -struct ReluGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * (out > static_cast(0)).template cast(); - } +using ReluGradGradFunctor = phi::funcs::ReluGradGradFunctor; - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; +template +using ReluCUDAFunctor = phi::funcs::ReluCUDAFunctor; // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) template @@ -504,7 +514,9 @@ struct TanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -534,7 +546,9 @@ struct TanhGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = (static_cast(1) - out * out) * ddx; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; /* Out @@ -589,7 +603,9 @@ struct TanhTripleGradFunctor : public BaseActivationFunctor { static_cast(2) * out * dout * d_dOutNew; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // tanhshrink(x) = x - tanh(x) @@ -610,7 +626,7 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x.tanh() * x.tanh()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // tanhshrink(x) = x - tanh(x) @@ -646,7 +662,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 || temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 @@ -682,7 +698,7 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // sqrt(x) = x^(1/2) @@ -702,7 +718,9 @@ struct SqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0.5) * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // rsqrt(x) = x^(-1/2) @@ -722,7 +740,9 @@ struct RsqrtGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(-0.5) * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // ceil(x) = ceiling(x) @@ -742,7 +762,9 @@ struct ZeroGradFunctor : public BaseActivationFunctor { dx.device(d) = static_cast(0) * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; 
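// Note: ZeroGradFunctor backs ceil/floor/round, whose derivative is zero almost everywhere, so it declares kNoDeps and the backward pass needs neither X nor Out.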
} + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; + } }; // floor(x) = flooring(x) @@ -754,373 +776,6 @@ struct FloorFunctor : public BaseActivationFunctor { } }; -template -struct Sine { - HOSTDEVICE T operator()(const T& val) const { return sin(val); } -}; - -template <> -struct Sine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sin(static_cast(val))); - } -}; - -template -struct Cosine { - HOSTDEVICE T operator()(const T& val) const { return cos(val); } -}; - -template <> -struct Cosine { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(cos(static_cast(val))); - } -}; - -// cosine'(x) = -sin(x) -template -struct CosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = -dout * x.unaryExpr(Sine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosine(x) = cos(x) -template -struct CosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosine()); - } -}; - -// sine'(x) = cos(x) -template -struct SinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosine()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// sine(x) = sin(x) -template -struct SinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sine()); - } -}; - -template -struct Tangent { - HOSTDEVICE T operator()(const T& val) const { return tan(val); } -}; - -template <> -struct Tangent { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(tan(static_cast(val))); - } -}; - -// Tangent'(x) = -Tangent(x) -template -struct TanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout / x.unaryExpr(Cosine()).square(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// Tangent(x) = tan(x) -template -struct TanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Tangent()); - } -}; - -template -struct Sinh { - HOSTDEVICE T operator()(const T& val) const { return sinh(val); } -}; - -template <> -struct Sinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(sinhf(static_cast(val))); - } -}; - -template -struct Cosh { - HOSTDEVICE T operator()(const T& val) const { return cosh(val); } -}; - -template <> -struct Cosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(coshf(static_cast(val))); - } -}; - -// sinh(x) = sinh(x) -template -struct SinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Sinh()); - } -}; - -// cosh(x) = cosh(x) -template -struct CoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Cosh()); - } -}; - -// sinh'(x) = cosh(x) -template -struct SinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out 
out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Cosh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -// cosh'(x) = sinh(x) -template -struct CoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * x.unaryExpr(Sinh()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acos { - HOSTDEVICE T operator()(const T& val) const { return acos(val); } -}; - -template <> -struct Acos { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acos(static_cast(val))); - } -}; - -// Acos(x) = acos(x) -template -struct AcosFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acos()); - } -}; - -// acos'(x) = -1/sqrt(1-x^2) -template -struct AcosGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asin { - HOSTDEVICE T operator()(const T& val) const { return asin(val); } -}; - -template <> -struct Asin { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asin(static_cast(val))); - } -}; - -// Asin(x) = asin(x) -template -struct AsinFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asin()); - } -}; - -// asin'(x) = 1/sqrt(1-x^2) -template -struct AsinGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atan { - HOSTDEVICE T operator()(const T& val) const { return atan(val); } -}; - -template <> -struct Atan { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atan(static_cast(val))); - } -}; - -// Atan(x) = atan(x) -template -struct AtanFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atan()); - } -}; - -// atan'(x) = 1 / (1 + x^2) -template -struct AtanGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Acosh { - HOSTDEVICE T operator()(const T& val) const { return acosh(val); } -}; - -template <> -struct Acosh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(acosh(static_cast(val))); - } -}; - -// Acosh(x) = acosh(x) -template -struct AcoshFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Acosh()); - } -}; - -// acosh'(x) = 1/sqrt(x^2 - 1) -template -struct AcoshGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x * x - static_cast(1)).sqrt(); - } - - 
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Asinh { - HOSTDEVICE T operator()(const T& val) const { return asinh(val); } -}; - -template <> -struct Asinh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(asinh(static_cast(val))); - } -}; - -// Asinh(x) = asinh(x) -template -struct AsinhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Asinh()); - } -}; - -// asinh'(x) = 1/sqrt(x^2 + 1) -template -struct AsinhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = - dout * static_cast(1) / (x.square() + static_cast(1)).sqrt(); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct Atanh { - HOSTDEVICE T operator()(const T& val) const { return atanh(val); } -}; - -template <> -struct Atanh { - HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { - return platform::float16(atanh(static_cast(val))); - } -}; - -// Atanh(x) = atanh(x) -template -struct AtanhFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out) const { - out.device(d) = x.unaryExpr(Atanh()); - } -}; - -// atanh'(x) = 1/(1 - x^2) -template -struct AtanhGradFunctor : public BaseActivationFunctor { - template - void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - dx.device(d) = dout * static_cast(1) / (static_cast(1) - x.square()); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - // round(x) = [x] template struct RoundFunctor : public BaseActivationFunctor { @@ -1147,7 +802,9 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(-1) * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // log(x) = natural logarithm of x @@ -1167,7 +824,7 @@ struct LogGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log2(x) = logarithm to the base 2 of the elements of x @@ -1188,7 +845,7 @@ struct Log2GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log10(x) = logarithm to the base 10 of the elements of x @@ -1209,7 +866,7 @@ struct Log10GradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(1) / (x * static_cast(log(10))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // log1p(x) = natural logarithm of x+1 @@ -1229,7 +886,7 @@ struct Log1pGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) / (x + static_cast(1))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // square(x) = x^2 @@ -1249,7 +906,7 @@ struct SquareGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * static_cast(2) * x; } - static constexpr ActBwdOpFwdDeps 
FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1285,7 +942,7 @@ struct BReluGradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // relu6(x) = min(max(0, x), 6) @@ -1319,7 +976,9 @@ struct Relu6GradFunctor : public BaseActivationFunctor { .template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; // HardSwish = min(max(0, x+3), 6) * x / 6 @@ -1364,7 +1023,7 @@ struct HardSwishGradFunctor : public BaseActivationFunctor { static_cast(1) * (static_cast(1) - tmp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // For numerical stability, using the following formula instead of softplus(x) = @@ -1409,7 +1068,7 @@ struct SoftplusGradFunctor : public BaseActivationFunctor { .select(dout, dout / (static_cast(1) + (-x_beta).exp())); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // mish(x) = x * tanh(softplus(x)) @@ -1449,7 +1108,7 @@ struct MishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (tsp + x * (static_cast(1) - tsp * tsp) * gsp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // softsign(x) = x / (1 + |x|) @@ -1472,7 +1131,7 @@ struct SoftsignGradFunctor : public BaseActivationFunctor { dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1504,7 +1163,9 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1539,7 +1200,7 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1573,7 +1234,7 @@ struct ELUGradFunctor : public BaseActivationFunctor { .select(dout, dout * (out + static_cast(alpha))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1592,7 +1253,7 @@ struct ELUGradNegativeAlphaFunctor : public BaseActivationFunctor { .select(dout, dout * static_cast(alpha) * x.exp()); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1672,7 +1333,7 @@ struct CELUGradFunctor : public BaseActivationFunctor { dout * (x / static_cast(alpha)).exp() * temp_a_neg * temp_x_neg; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 @@ -1701,7 
+1362,7 @@ struct PowGradFunctor : public BaseActivationFunctor { x.pow(static_cast(factor) - static_cast(1)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1766,7 +1427,7 @@ struct STanhGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * a * b * (static_cast(1) - temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1797,7 +1458,7 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * (x > th).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1832,7 +1493,9 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { static_cast(slope); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1865,7 +1528,7 @@ struct SwishGradFunctor : public BaseActivationFunctor { dx.device(d) = dout * ((static_cast(beta) * out) + temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; /* @@ -1902,7 +1565,7 @@ inline void ExtractActivationDoubleGradTensor( "Cannot get the tensor from the Variable Output, variable name = %s", ctx.OutputName("DDX"))); - if (static_cast(kDepValue) & static_cast(kDepX)) { + if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepX)) { auto x_var = ctx.InputVar("X"); PADDLE_ENFORCE_NOT_NULL( x_var, platform::errors::NotFound( @@ -1925,7 +1588,8 @@ inline void ExtractActivationDoubleGradTensor( VLOG(10) << "Inplace activation of Op: " << ctx.Type(); *X = *ddX; } - if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { auto out_var = ctx.InputVar("Out"); PADDLE_ENFORCE_NOT_NULL( out_var, @@ -2000,28 +1664,7 @@ struct AbsGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * x.sign(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct ReluGradGradFunctor : public BaseActivationFunctor { - template - void operator()(const Device& dev, const framework::Tensor* X, - const framework::Tensor* Out, const framework::Tensor* ddX, - framework::Tensor* ddOut, framework::Tensor* dOut, - framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddX, "Input", "DDX", "ReluGradGrad")); - auto out = framework::EigenVector::Flatten( - GET_DATA_SAFELY(Out, "Output", "Out", "ReluGradGrad")); - if (ddOut) { - auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "ReluGradGrad")); - ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); - } - } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2050,7 +1693,7 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2088,7 +1731,7 @@ struct ELUGradGradFunctor : public BaseActivationFunctor { 
.template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2127,7 +1770,7 @@ struct CELUGradGradFunctor : public BaseActivationFunctor { .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -2156,7 +1799,9 @@ struct SqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(0.5) / out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2185,7 +1830,9 @@ struct RsqrtGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(-0.5) * out * out * out; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -2214,7 +1861,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { ddout.device(*d) = ddx * static_cast(2) * x; } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; // TODO(dengkaipeng): double gradient calculation for Square/Sqrt need @@ -2840,7 +2487,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; } // namespace operators @@ -2849,20 +2496,9 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ - __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ - __macro(cos, Cos, CosFunctor, CosGradFunctor); \ - __macro(tan, Tan, TanFunctor, TanGradFunctor); \ - __macro(acos, Acos, AcosFunctor, AcosGradFunctor); \ - __macro(sin, Sin, SinFunctor, SinGradFunctor); \ - __macro(asin, Asin, AsinFunctor, AsinGradFunctor); \ - __macro(sinh, Sinh, SinhFunctor, SinhGradFunctor); \ - __macro(cosh, Cosh, CoshFunctor, CoshGradFunctor); \ - __macro(asinh, Asinh, AsinhFunctor, AsinhGradFunctor); \ - __macro(acosh, Acosh, AcoshFunctor, AcoshGradFunctor); \ - __macro(atanh, Atanh, AtanhFunctor, AtanhGradFunctor); \ __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps index 3b7ce9eaf2bea72b4b4b843ac4df91de47a688c4..92a101451e211f912e5390171654affa3be4e973 100644 --- a/paddle/fluid/operators/activation_op.kps +++ b/paddle/fluid/operators/activation_op.kps @@ -18,28 +18,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -struct CudaReluFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // relu(x) = max(x, 0) - __device__ __forceinline__ T operator()(const T x) const { - return x > zero ? 
x : zero; - } -}; - -template -struct CudaReluGradFunctor : public BaseActivationFunctor { - T zero = static_cast(0.0f); - - // dx = dout * (out > 0) - __device__ __forceinline__ T operator()(const T dout, const T out) const { - return out > zero ? dout : zero; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - template struct CudaLeakyReluFunctor : public BaseActivationFunctor { T zero = static_cast(0.0f); @@ -69,7 +47,7 @@ struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { return x > zero ? dout : static_cast(alpha) * dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -93,7 +71,9 @@ struct CudaSigmoidGradFunctor : public BaseActivationFunctor { return dout * out * (one - out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -122,7 +102,7 @@ struct CudaSiluGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp * (one + x * (one - temp)))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -159,30 +139,7 @@ struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // atan(x) = atan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atan(x)); - } -}; - -template -struct CudaAtanGradFunctor : public BaseActivationFunctor { - T one = static_cast(1.0f); - - // dx = dout / (1 + x^2) - __device__ __forceinline__ T operator()(const T dout, const T x) const { - return dout / (one + x * x); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -219,7 +176,7 @@ struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { return (x >= -l && x <= l) ? 
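// Note: softshrink is identically zero on [-lambda, lambda], so the incoming gradient is masked there and passed through unchanged outside that band.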
zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -262,191 +219,9 @@ struct CudaZeroGradFunctor : public BaseActivationFunctor { return static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } -}; - -template -struct CudaCosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cos(x) = cos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cos(x)); - } -}; - -template -struct CudaCosGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * (-sin(x)) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout * sin(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sin(x) = sin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sin(x)); - } -}; - -template -struct CudaSinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cos(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cos(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaTanFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // tan(x) = tan(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(tan(x)); - } -}; - -template -struct CudaTanGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout / cos(x)^2 - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / (cos(x) * cos(x))); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // asin(x) = asin(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asin(x)); - } -}; - -template -struct CudaAsinGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout / sqrt(one - x * x)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kNoDeps; } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAcosFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // acos(x) = acos(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - 
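// Note: MPType (details::MPTypeTrait<T>::Type) widens float16 to float so acos is evaluated at full precision before the result is narrowed back to T.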
return static_cast(acos(x)); - } -}; - -template -struct CudaAcosGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = -dout / sqrt(1 - x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(-dout / sqrt(one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaCoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // cosh(x) = cosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(cosh(x)); - } -}; - -template -struct CudaCoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * sinh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * sinh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaSinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // sinh(x) = sinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(sinh(x)); - } -}; - -template -struct CudaSinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // dx = dout * cosh(x) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * cosh(x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template @@ -469,88 +244,11 @@ struct CudaTanhGradFunctor : public BaseActivationFunctor { return dout * (one - out * out); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template -struct CudaAcoshFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Acosh(x) = acosh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(acosh(x)); + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; } }; -template -struct CudaAcoshGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1 / sqrt(x^2 - 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x - one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAsinhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Asinh(x) = asinh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(asinh(x)); - } -}; - -template -struct CudaAsinhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - - // dx = dout * 1/sqrt(x^2 + 1) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = 
static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / sqrt(x * x + one)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - -template -struct CudaAtanhFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - - // Atanh(x) = atanh(x) - __device__ __forceinline__ T operator()(const T arg_x) const { - MPType x = static_cast(arg_x); - return static_cast(atanh(x)); - } -}; - -template -struct CudaAtanhGradFunctor : public BaseActivationFunctor { - using MPType = typename details::MPTypeTrait::Type; - MPType one = static_cast(1.0f); - // dx = dout * 1/(1- x^2) - __device__ __forceinline__ T operator()(const T arg_dout, - const T arg_x) const { - MPType dout = static_cast(arg_dout); - MPType x = static_cast(arg_x); - return static_cast(dout * one / (one - x * x)); - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } -}; - template struct CudaReciprocalFunctor : public BaseActivationFunctor { T one = static_cast(1.0f); @@ -566,7 +264,9 @@ struct CudaReciprocalGradFunctor : public BaseActivationFunctor { return -dout * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -587,7 +287,9 @@ struct CudaExpGradFunctor : public BaseActivationFunctor { return dout * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -608,7 +310,9 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor { return dout * out + dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -629,7 +333,7 @@ struct CudaLogGradFunctor : public BaseActivationFunctor { return dout / x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -647,7 +351,7 @@ struct CudaSquareGradFunctor : public BaseActivationFunctor { return dout * two * x; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -670,7 +374,9 @@ struct CudaSqrtGradFunctor : public BaseActivationFunctor { return one_half * dout / out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -693,7 +399,9 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor { return minus_one_half * dout * out * out * out; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -717,7 +425,7 @@ struct CudaLog1pGradFunctor : public BaseActivationFunctor { return dout / (one + x); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -741,7 +449,7 @@ struct CudaLog2GradFunctor : public BaseActivationFunctor { return dout / (x * log_two); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -765,7 +473,7 @@ struct CudaLog10GradFunctor : public BaseActivationFunctor { return 
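// Note: d/dx log10(x) = 1 / (x * ln 10), so the upstream gradient is divided by x * log(10).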
dout / (x * log_ten); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -804,7 +512,7 @@ struct CudaBReluGradFunctor : public BaseActivationFunctor { return (x > t_min_cast && x < t_max_cast) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -849,7 +557,9 @@ struct CudaSoftReluGradFunctor : public BaseActivationFunctor { : static_cast(0.0f); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -893,7 +603,7 @@ struct CudaSTanhGradFunctor : public BaseActivationFunctor { return static_cast(dout * a * b * (one - temp * temp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -939,7 +649,7 @@ struct CudaSoftplusGradFunctor : public BaseActivationFunctor { return x_beta > t ? arg_dout : static_cast(dout / (one + exp(-x_beta))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -962,7 +672,7 @@ struct CudaSoftsignGradFunctor : public BaseActivationFunctor { return dout / (temp * temp); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -996,7 +706,9 @@ struct CudaRelu6GradFunctor : public BaseActivationFunctor { return (out > zero && out < t) ? dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1022,7 +734,7 @@ struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { return static_cast(dout * tanh(x) * tanh(x)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1056,7 +768,7 @@ struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { return (x > -t && x < t) ? zero : dout; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1097,7 +809,9 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { return (out > zero && out < one) ? dout * static_cast(slope) : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1141,7 +855,7 @@ struct CudaSwishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (temp2 + temp3)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1190,7 +904,7 @@ struct CudaMishGradFunctor : public BaseActivationFunctor { return static_cast(dout * (tsp + x * (one - tsp * tsp) * gsp)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1222,7 +936,7 @@ struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { return x > static_cast(threshold) ? 
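// Note: thresholded_relu is the identity above the threshold and flat below it, so the gradient is dout where x > threshold and zero elsewhere.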
dout : zero; } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1274,7 +988,7 @@ struct CudaHardSwishGradFunctor : public BaseActivationFunctor { return dout * (temp1 * temp2 * (two * x + o) / s + one - temp2); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1320,7 +1034,9 @@ struct CudaELUGradFunctor : public BaseActivationFunctor { return static_cast(dout * (out_pos + out_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } + static constexpr ActBwdOpFwdDeps FwdDeps() { + return ActBwdOpFwdDeps::kDepOut; + } }; template @@ -1347,7 +1063,7 @@ struct CudaELUGradNegativeAlphaFunctor : public BaseActivationFunctor { return static_cast(dout * (x_pos + x_neg * (out + a))); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1429,7 +1145,7 @@ struct CudaCELUGradFunctor : public BaseActivationFunctor { temp_a_neg * temp_x_pos + exp(x / a) * temp_a_neg * temp_x_neg)); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; template @@ -1477,13 +1193,14 @@ class ActivationGradCudaKernel std::vector ins = {d_out}; std::vector outs = {d_x}; - if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + if (static_cast(Functor::FwdDeps()) == + static_cast(ActBwdOpFwdDeps::kDepOut)) { // Only need forward output Out ins.push_back(out); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, &outs, functor); } else if (static_cast(Functor::FwdDeps()) == - static_cast(kDepX)) { + static_cast(ActBwdOpFwdDeps::kDepX)) { // Only need forward input X ins.push_back(x); paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, @@ -1602,50 +1319,6 @@ REGISTER_OP_CUDA_KERNEL( ops::CELUGradGradFunctor>); /* ========================================================================== */ -/* =========================== relu register ============================ */ -#ifdef PADDLE_WITH_HIP -REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor, - CudaReluGradFunctor); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#else -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>, - ops::ActivationCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>, - ops::ActivationGradCudaKernel>); -REGISTER_OP_CUDA_KERNEL( - relu_grad_grad, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>, - ops::ActivationDoubleGradKernel>); -#endif -/* ========================================================================== */ - /* =========================== sigmoid register ============================ */ REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, @@ -1838,21 +1511,10 @@ REGISTER_OP_CUDA_KERNEL( __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ CudaLogSigmoidGradFunctor); \ - __macro(atan, Atan, CudaAtanFunctor, CudaAtanGradFunctor); \ 
__macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ CudaSoftShrinkGradFunctor); \ __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ __macro(floor, Floor, CudaFloorFunctor, CudaZeroGradFunctor); \ - __macro(cos, Cos, CudaCosFunctor, CudaCosGradFunctor); \ - __macro(tan, Tan, CudaTanFunctor, CudaTanGradFunctor); \ - __macro(acos, Acos, CudaAcosFunctor, CudaAcosGradFunctor); \ - __macro(sin, Sin, CudaSinFunctor, CudaSinGradFunctor); \ - __macro(asin, Asin, CudaAsinFunctor, CudaAsinGradFunctor); \ - __macro(sinh, Sinh, CudaSinhFunctor, CudaSinhGradFunctor); \ - __macro(cosh, Cosh, CudaCoshFunctor, CudaCoshGradFunctor); \ - __macro(asinh, Asinh, CudaAsinhFunctor, CudaAsinhGradFunctor); \ - __macro(acosh, Acosh, CudaAcoshFunctor, CudaAcoshGradFunctor); \ - __macro(atanh, Atanh, CudaAtanhFunctor, CudaAtanhGradFunctor); \ __macro(round, Round, CudaRoundFunctor, CudaZeroGradFunctor); \ __macro(reciprocal, Reciprocal, CudaReciprocalFunctor, \ CudaReciprocalGradFunctor); \ @@ -1891,8 +1553,6 @@ FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) REGISTER_ACTIVATION_XPU_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, CudaLeakyReluGradFunctor); -REGISTER_ACTIVATION_XPU_KERNEL(relu, Relu, CudaReluFunctor, - CudaReluGradFunctor); REGISTER_ACTIVATION_XPU_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, CudaSigmoidGradFunctor); REGISTER_ACTIVATION_XPU_KERNEL(exp, Exp, CudaExpFunctor, CudaExpGradFunctor); diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 0f5c048b6be9c73ae98181685269592f409196cd..c5e4188ca2d6f749a06127c41da99490a7fb3ffc 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -15,23 +15,19 @@ limitations under the License. */ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_max, ArgMaxInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); + REGISTER_OPERATOR( arg_max, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMaxOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - arg_max, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel); + paddle::framework::EmptyGradOpMaker, + ArgMaxInferShapeFunctor); + REGISTER_OP_VERSION(arg_max) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h deleted file mode 100644 index b77031f7fb4c9d94f30ed06333b9c8766fd2310d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/arg_min_max_op_base.cu.h +++ /dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if defined(__NVCC__) || defined(__HIPCC__) - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include -#include -#include -#include -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -namespace { // NOLINT -template -using KeyValuePair = cub::KeyValuePair; -using Tensor = framework::Tensor; - -} // end namespace - -#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ - case (1 << (log2_block_dim)): { \ - constexpr auto kBlockDim = (1 << (log2_block_dim)); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM_CASE(...) \ - FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); - -template -__global__ void ArgCUDAKernel(const int64_t height, // n * h - const int64_t width, // c - const int64_t post_size, // h - const Reducer reducer, const T init, const T* in, - IndType* out) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int idx = blockIdx.x; idx < height; idx += gridDim.x) { - KeyValuePair kv_pair = {-1, init}; - int h = idx / post_size; - int w = idx % post_size; - for (int k = threadIdx.x; k < width; k += blockDim.x) { - kv_pair = - reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair); - } - kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); - if (threadIdx.x == 0) { - out[idx] = static_cast(kv_pair.key); - } - __syncthreads(); - } -} - -template -void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, - Tensor* indices, const int64_t pre, const int64_t post, - const int64_t n) { - auto cu_stream = ctx.stream(); - auto ComputeBlockSize = [](int64_t col) { - auto block_size = 8; - if (col > 512) - block_size = 1024; - else if (col > 256) - block_size = 512; - else if (col > 128) - block_size = 256; - else if (col > 64) - block_size = 128; - else if (col > 32) - block_size = 64; - else if (col > 16) - block_size = 32; - else if (col > 8) - block_size = 16; -#ifdef __HIPCC__ - block_size = std::min(block_size, 256); -#endif - return block_size; - }; - - int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; - int64_t height = pre * post; - int64_t width = n; - int64_t grid_size = height < max_grid_dimx ? 
height : max_grid_dimx; - - const T* in_data = input.data(); - IndType* out_data = indices->mutable_data(ctx.GetPlace()); - - if (typeid(Reducer) == typeid(cub::ArgMax)) { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::lowest(), - in_data, out_data)); - } - } else { - switch (ComputeBlockSize(width)) { - FIXED_BLOCK_DIM_CASE( - ArgCUDAKernel<<>>( - height, width, post, Reducer(), std::numeric_limits::max(), - in_data, out_data)); - } - } -} - -template -struct VisitDataCudaArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - int axis = ctx.Attr("axis"); - const bool& flatten = ctx.Attr("flatten"); - - framework::DDim input_dims; - if (flatten) { - input_dims = phi::make_ddim({input->numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - input_dims = input->dims(); - if (axis < 0) axis += input->dims().size(); - } - - int64_t numel = input->numel(); - int64_t groups = numel / input_dims[axis]; - int64_t pre = 1; - int64_t post = 1; - int64_t n = input_dims[axis]; - - for (int i = 0; i < axis; i++) { - pre *= input_dims[i]; - } - - for (int i = axis + 1; i < input_dims.size(); i++) { - post *= input_dims[i]; - } - - const auto& dev_ctx = ctx.cuda_device_context(); - ComputeFullArg(dev_ctx, *input, output, pre, post, n); - } -}; -template -class ArgMinMaxOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataCudaArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataCudaArgMinMaxFunctor(ctx)); - } -}; - -#endif - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index d3ce61d183a3d322e40966ce59f9a10320ceab4f..585341beea12c14fbd01a3a47af34ce57def0db5 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -27,193 +27,9 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -enum ArgMinMaxType { kArgMin, kArgMax }; - -template -struct ArgMinMaxFunctor {}; - -#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ - template \ - struct ArgMinMaxFunctor { \ - void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ - framework::LoDTensor* out, framework::DDim x_dims, \ - int64_t axis, bool keepdims) { \ - auto in_eigen = framework::EigenTensor::From(in, x_dims); \ - if (keepdims) { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } else { \ - auto out_eigen = framework::EigenTensor::From(*out); \ - out_eigen.device(*(ctx.eigen_device())) = \ - in_eigen.eigen_op_type(axis).template cast(); \ - } \ - } \ - } - -DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); -DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); - -template -struct VisitDataArgMinMaxFunctor { - const framework::ExecutionContext& ctx; - - explicit VisitDataArgMinMaxFunctor(const framework::ExecutionContext& ctx) - : ctx(ctx) {} - template - void apply() const { - auto& x = *(ctx.Input("X")); - auto& out = *(ctx.Output("Out")); - out.template mutable_data(ctx.GetPlace()); - auto axis = ctx.Attr("axis"); - auto keepdims = ctx.Attr("keepdims"); - const bool& flatten = ctx.Attr("flatten"); - // paddle do not have the scalar tensor, just return the shape [1] tensor - if (flatten) keepdims = true; - - // if flatten, will construct the new dims for the cacluate - framework::DDim x_dims; - if (flatten) { - x_dims = phi::make_ddim({x.numel()}); - // if flatten, the axis just as 0 - axis = 0; - } else { - x_dims = x.dims(); - if (axis < 0) axis += x_dims.size(); - } - auto& dev_ctx = ctx.template device_context(); - -#define CALL_ARG_MINMAX_FUNCTOR(rank) \ - ArgMinMaxFunctor \ - functor##rank; \ - functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims) - - switch (x_dims.size()) { - case 1: - CALL_ARG_MINMAX_FUNCTOR(1); - break; - case 2: - CALL_ARG_MINMAX_FUNCTOR(2); - break; - case 3: - CALL_ARG_MINMAX_FUNCTOR(3); - break; - case 4: - CALL_ARG_MINMAX_FUNCTOR(4); - break; - case 5: - CALL_ARG_MINMAX_FUNCTOR(5); - break; - case 6: - CALL_ARG_MINMAX_FUNCTOR(6); - break; - default: - PADDLE_ENFORCE_LE( - x_dims.size(), 6, - platform::errors::InvalidArgument( - "%s operator doesn't supports tensors whose ranks are greater " - "than 6.", - (EnumArgMinMaxValue == kArgMin ? 
"argmin" : "argmax"))); - break; -#undef CALL_ARG_MINMAX_FUNCTOR - } - } -}; - -template -class ArgMinMaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto& dtype = ctx.Attr("dtype"); - if (dtype < 0) { - framework::VisitDataTypeTiny( - static_cast( - framework::proto::VarType::INT64), - VisitDataArgMinMaxFunctor(ctx)); - return; - } - framework::VisitDataTypeTiny( - static_cast(dtype), - VisitDataArgMinMaxFunctor(ctx)); - } -}; - -template -using ArgMinKernel = ArgMinMaxKernel; - -template -using ArgMaxKernel = ArgMinMaxKernel; - class ArgMinMaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "arg_min_max"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "arg_min_max"); - const auto& x_dims = ctx->GetInputDim("X"); - int64_t axis = ctx->Attrs().Get("axis"); - bool keepdims = ctx->Attrs().Get("keepdims"); - const bool& flatten = ctx->Attrs().Get("flatten"); - - PADDLE_ENFORCE_GE(axis, -x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -Rank(X)(%d).", - axis, -x_dims.size())); - PADDLE_ENFORCE_LT( - axis, x_dims.size(), - platform::errors::InvalidArgument( - "'axis'(%d) must be less than Rank(X)(%d) of Input(X).", axis, - x_dims.size())); - - const int& dtype = ctx->Attrs().Get("dtype"); - PADDLE_ENFORCE_EQ( - (dtype < 0 || dtype == 2 || dtype == 3), true, - platform::errors::InvalidArgument( - "The attribute of dtype in argmin/argmax must be [%s] or [%s], but " - "received [%s]", - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64), - paddle::framework::DataTypeToString( - static_cast(dtype)))); - - auto x_rank = x_dims.size(); - if (axis < 0) axis += x_rank; - if (ctx->IsRuntime()) { - if (dtype == framework::proto::VarType::INT32) { - int64_t all_element_num = 0; - if (flatten) { - all_element_num = phi::product(x_dims); - - } else { - all_element_num = x_dims[axis]; - } - PADDLE_ENFORCE_LE( - all_element_num, INT_MAX, - platform::errors::InvalidArgument( - "The element num of the argmin/argmax input at axis is " - "%d, is larger than int32 maximum value:%d, you must " - "set the dtype of argmin/argmax to 'int64'.", - all_element_num, INT_MAX)); - } - } - std::vector vec; - if (flatten) { - vec.emplace_back(static_cast(1)); - } else { - for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]); - if (keepdims) { - vec.emplace_back(static_cast(1)); - } - for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]); - } - ctx->SetOutputDim("Out", phi::make_ddim(vec)); - } }; class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 0a4ba6fb0bfdfccfc4eae99da730e96fe5f0a540..fb3abd01af8c396d764f9f1d247f24c41bd15959 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -12,26 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/arg_min_max_op_base.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +DECLARE_INFER_SHAPE_FUNCTOR(arg_min, ArgMinInferShapeFunctor, + PD_INFER_META(phi::ArgMinMaxInferMeta)); REGISTER_OPERATOR( arg_min, paddle::operators::ArgMinMaxOp, paddle::operators::ArgMinOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + ArgMinInferShapeFunctor); -REGISTER_OP_CPU_KERNEL( - arg_min, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel); REGISTER_OP_VERSION(arg_min) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc index 9e525c20335d37242d0e239e81d2d2976b92a6b4..1a8aca777370bc140e39b7457702557042541744 100644 --- a/paddle/fluid/operators/argsort_op.cc +++ b/paddle/fluid/operators/argsort_op.cc @@ -12,40 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/argsort_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { class ArgsortOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "argsort"); - OP_INOUT_CHECK(ctx->HasOutput("Indices"), "Output", "Indices", "argsort"); - - auto in_dims = ctx->GetInputDim("X"); - int axis = ctx->Attrs().Get("axis"); - - auto num_dims = in_dims.size(); - PADDLE_ENFORCE_GE(axis, -num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be greater than or equal to" - " -num_dims(%d).", - axis, -num_dims)); - PADDLE_ENFORCE_LT( - axis, num_dims, - platform::errors::InvalidArgument( - "'axis'(%d) must be less than num_dims(%d).", axis, num_dims)); - - ctx->ShareDim("X", "Out"); - ctx->ShareDim("X", "Indices"); - ctx->ShareLoD("X", "Out"); - ctx->ShareLoD("X", "Indices"); - } }; class ArgsortGradOp : public framework::OperatorWithKernel { @@ -122,18 +101,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(ArgsortGradNoNeedBufferVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(argsort, ArgsortInferShapeFunctor, + PD_INFER_META(phi::ArgsortInferMeta)); REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker, ops::ArgsortGradOpMaker, - ops::ArgsortGradOpMaker); + ops::ArgsortGradOpMaker, + ArgsortInferShapeFunctor); REGISTER_OPERATOR(argsort_grad, ops::ArgsortGradOp, ops::ArgsortGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL(argsort, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel, - ops::ArgsortKernel); -REGISTER_OP_CPU_KERNEL( - argsort_grad, ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel, - ops::ArgsortGradientKernel); diff --git a/paddle/fluid/operators/argsort_op.cu 
b/paddle/fluid/operators/argsort_op.cu deleted file mode 100644 index 8b7a0b3eadb16bbe0822809748e343dc0d793a0f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/argsort_op.cu +++ /dev/null @@ -1,430 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/argsort_op.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -#ifdef __HIPCC__ -namespace rocprim { -namespace detail { -template <> -struct radix_key_codec_base - : radix_key_codec_integral {}; -} // namespace detail -} // namespace rocprim -#else -// set cub base traits in order to handle float16 -namespace cub { -template <> -struct NumericTraits - : BaseTraits {}; -} // namespace cub -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -// Iter for move to next row -struct SegmentOffsetIter { - EIGEN_DEVICE_FUNC - explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { - return idx * num_cols_; - } - - int num_cols_; -}; - -template -static __global__ void FillIndex(T* indices, T num_rows, T num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (T j = row_id; j < num_rows; j += gridDim.x) { - for (T i = col_id; i < num_cols; i += blockDim.x) { - indices[j * num_cols + i] = i; - } - } -} - -template -static __global__ void FillFlattenGrad(const T* dO, const IndType* indices, - int64_t size, T* dX) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - for (int i = index; i < size; i += stride) { - dX[indices[i]] = dO[i]; - } -} - -template -static __global__ void FillGrad(const T* dO, const IndType* indices, T* dX, - IndType num_rows, IndType num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (IndType j = row_id; j < num_rows; j += gridDim.x) { - for (IndType i = col_id; i < num_cols; i += blockDim.x) { - dX[j * num_cols + indices[j * num_cols + i]] = dO[j * num_cols + i]; - } - } -} - -// Sort by flag descending, True: descending. False: Ascending. -// Default is false. 
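// ---------------------------------------------------------------------------
// [Editorial aside, not part of the original diff] The ArgFullSort helper the
// deleted lines below define drives cub::DeviceSegmentedRadixSort through
// cub's two-phase protocol: a first call with a null scratch pointer makes
// cub only report the temp-storage size it needs, and a second, otherwise
// identical call with real scratch memory performs the sort. A minimal
// self-contained sketch of that idiom (hypothetical helper name, float keys,
// int indices, error checking omitted):
static void SegmentedArgsortSketch(const float* d_keys_in, float* d_keys_out,
                                   const int* d_vals_in, int* d_vals_out,
                                   int num_items, int num_segments,
                                   const int* d_offsets,  // num_segments + 1
                                   cudaStream_t stream) {
  size_t temp_bytes = 0;
  // Phase 1: null scratch pointer, cub only computes temp_bytes.
  cub::DeviceSegmentedRadixSort::SortPairs(
      nullptr, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
      num_items, num_segments, d_offsets, d_offsets + 1, 0,
      static_cast<int>(sizeof(float) * 8), stream);
  void* d_temp = nullptr;
  cudaMalloc(&d_temp, temp_bytes);
  // Phase 2: identical arguments plus real scratch memory run the sort; when
  // d_vals_in holds 0..row_len-1 per segment, the sorted values come out as
  // exactly the argsort indices.
  cub::DeviceSegmentedRadixSort::SortPairs(
      d_temp, temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out,
      num_items, num_segments, d_offsets, d_offsets + 1, 0,
      static_cast<int>(sizeof(float) * 8), stream);
  cudaFree(d_temp);
}
// ---------------------------------------------------------------------------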
-template -void ArgFullSort(const platform::CUDADeviceContext& ctx, const Tensor* input, - Tensor* output, Tensor* indices, const IndType num_rows, - const IndType num_cols, const bool descending) { - auto cu_stream = ctx.stream(); - - Tensor input_indices; - - const std::vector dims = {num_rows, num_cols}; - auto dim = phi::make_ddim(dims); - input_indices.Resize(dim); - input_indices.mutable_data(ctx.GetPlace()); - - size_t temp_storage_bytes = -1; - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? num_rows : maxGridDimX; - // Init a index array - FillIndex<<>>( - input_indices.data(), num_rows, num_cols); - - T* sorted_out_ptr; - IndType* sorted_indices_ptr; - - const T* inp = input->data(); - T* out = output->mutable_data(ctx.GetPlace()); - IndType* ind = indices->mutable_data(ctx.GetPlace()); - - sorted_out_ptr = out; - sorted_indices_ptr = ind; - - // create iter for counting input - cub::CountingInputIterator counting_iter(0); - // segment_offset is used for move to next row - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - - gpuError_t err; - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - nullptr, temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - PADDLE_ENFORCE_GPU_SUCCESS(err); - - Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - if (descending) { - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } else { - err = cub::DeviceSegmentedRadixSort::SortPairs( - temp_storage.data(), temp_storage_bytes, inp, sorted_out_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - } - - PADDLE_ENFORCE_GPU_SUCCESS(err); -} - -template -void ArgFullAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO, - const Tensor* indices, Tensor* dX, const IndType num_rows, - const IndType num_cols) { - auto cu_stream = ctx.stream(); - - auto ComputeBlockSize = [](IndType col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - int maxGridDimX = ctx.GetCUDAMaxGridDimSize()[0]; - // actually, int num_rows < max_grid_size - int grid_size = num_rows < maxGridDimX ? 
num_rows : maxGridDimX;
-  FillGrad<<<grid_size, block_size, 0, cu_stream>>>(
-      dO->data<T>(), indices->data<IndType>(), dX->data<T>(), num_rows,
-      num_cols);
-}
-
-template <typename T, typename IndType>
-void ArgFlattenAssign(const platform::CUDADeviceContext& ctx, const Tensor* dO,
-                      const Tensor* indices, int64_t size, Tensor* dX) {
-  auto cu_stream = ctx.stream();
-
-  const int64_t block_size =
-      std::min(size, static_cast<int64_t>(ctx.GetMaxThreadsPerBlock()));
-  int64_t max_threads = ctx.GetMaxPhysicalThreadCount();
-  const int64_t max_blocks =
-      std::max(((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
-  const int64_t grid_size =
-      std::min(max_blocks, (size + block_size - 1) / block_size);
-
-  FillFlattenGrad<<<grid_size, block_size, 0, cu_stream>>>(
-      dO->data<T>(), indices->data<IndType>(), size, dX->data<T>());
-}
-
-template <typename T>
-class ArgsortOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<Tensor>("X");
-    auto* output = ctx.Output<Tensor>("Out");
-    auto* indices = ctx.Output<Tensor>("Indices");
-    int axis = ctx.Attr<int>("axis");
-    bool descending = ctx.Attr<bool>("descending");
-
-    auto in_dims = input->dims();
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-
-    const T* in_data = input->data<T>();
-    auto size = input->numel();
-    T* out_data = output->mutable_data<T>(ctx.GetPlace());
-    int64_t* ids_data = indices->mutable_data<int64_t>(ctx.GetPlace());
-
-    // Use thrust for parallel acceleration when the input size is equal to the
-    // length of the 'axis' dimension.
-    // Compared to the following 'Special case for full sort', ascending sort is
-    // 34 times faster and descending sort is 31 times faster.
-    if (size == in_dims[axis]) {
-      thrust::sequence(thrust::device, ids_data, ids_data + size);
-      thrust::copy(thrust::device, in_data, in_data + size, out_data);
-      thrust::sort_by_key(thrust::device, out_data, out_data + size, ids_data);
-      if (descending) {
-        thrust::reverse(thrust::device, out_data, out_data + size);
-        thrust::reverse(thrust::device, ids_data, ids_data + size);
-      }
-      return;
-    }
-
-    // Special case for full sort, speedup ~190x.
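// ---------------------------------------------------------------------------
// [Editorial aside, not part of the original diff] The thrust branch above is
// a flat argsort: seed the index buffer with 0..n-1, copy the keys, sort keys
// and indices together, and obtain descending order by reversing both sorted
// sequences. Stated on its own (hypothetical helper name; assumes the thrust
// headers this file already includes):
static void FlatArgsortSketch(const float* d_in, float* d_sorted,
                              int64_t* d_idx, int64_t n, bool descending) {
  thrust::sequence(thrust::device, d_idx, d_idx + n);      // 0, 1, ..., n-1
  thrust::copy(thrust::device, d_in, d_in + n, d_sorted);  // keep input intact
  thrust::sort_by_key(thrust::device, d_sorted, d_sorted + n, d_idx);
  if (descending) {
    // ascending sort plus reversal of keys and indices == descending argsort
    thrust::reverse(thrust::device, d_sorted, d_sorted + n);
    thrust::reverse(thrust::device, d_idx, d_idx + n);
  }
}
// ---------------------------------------------------------------------------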
-    if (axis == -1 || axis + 1 == in_dims.size()) {
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t input_width = in_dims[in_dims.size() - 1];
-      const auto& dev_ctx = ctx.cuda_device_context();
-      ArgFullSort<T, int64_t>(dev_ctx, input, output, indices, input_height,
-                              input_width, descending);
-    } else {
-      // if not full sort, do transpose first
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.push_back(i);
-      }
-      trans.push_back(axis);
-      framework::DDim trans_dims(in_dims);
-      for (int i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-      }
-
-      Tensor trans_inp;
-      T* trans_inp_data = trans_inp.mutable_data<T>(trans_dims, ctx.GetPlace());
-      int ndims = trans.size();
-      const auto& dev_ctx = ctx.cuda_device_context();
-      // Do transpose
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, *input,
-                                                   &trans_inp, trans);
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-
-      Tensor tmp_out;
-      tmp_out.mutable_data<T>(trans_dims, ctx.GetPlace());
-      T* out_data = output->mutable_data<T>(ctx.GetPlace());
-
-      Tensor tmp_indices;
-      // temp indices for sorting
-      tmp_indices.mutable_data<int64_t>(trans_dims, ctx.GetPlace());
-      indices->mutable_data<int64_t>(ctx.GetPlace());
-
-      ArgFullSort<T, int64_t>(dev_ctx, &trans_inp, &tmp_out, &tmp_indices,
-                              input_height, input_width, descending);
-
-      TransCompute<platform::CUDADeviceContext, int64_t>(
-          ndims, dev_ctx, tmp_indices, indices, trans);
-      // transpose back
-      TransCompute<platform::CUDADeviceContext, T>(ndims, dev_ctx, tmp_out,
-                                                   output, trans);
-      return;
-    }
-  }
-};
-
-template <typename T>
-class ArgsortGradOpCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* indices = ctx.Input<Tensor>("Indices");
-    auto* dX = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
-    int axis = ctx.Attr<int>("axis");
-
-    dX->mutable_data<T>(ctx.GetPlace());
-    if (dO->numel() == 0) return;
-
-    auto in_dims = dX->dims();
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-
-    int64_t size = dX->numel();
-    const auto& dev_ctx = ctx.cuda_device_context();
-
-    // Parallel acceleration when the input size is equal to the length of the
-    // 'axis' dimension.
-    // Compared to 'special case for full sort' below, the gradient calculation
-    // is 10 times faster.
-    if (size == in_dims[axis]) {
-      ArgFlattenAssign<T, int64_t>(dev_ctx, dO, indices, size, dX);
-      return;
-    }
-
-    // Special case for full sort, speedup ~190x.
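// ---------------------------------------------------------------------------
// [Editorial aside, not part of the original diff] As the ArgFlattenAssign /
// FillFlattenGrad path above shows, argsort's backward pass is a pure
// scatter: each upstream gradient element is written back to the slot its
// value originally came from, dX[indices[i]] = dO[i]. A grid-stride CUDA
// kernel expressing just that operation (hypothetical name):
template <typename T>
__global__ void ScatterArgsortGradSketch(const T* d_out_grad,
                                         const int64_t* d_indices,
                                         int64_t n, T* d_x_grad) {
  int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
  for (; i < n; i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    d_x_grad[d_indices[i]] = d_out_grad[i];  // undo the sort's permutation
  }
}
// ---------------------------------------------------------------------------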
- if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - ArgFullAssign(dev_ctx, dO, indices, dX, input_height, - input_width); - } else { - // if not full sort, do transpose first - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - ArgFullAssign(dev_ctx, &trans_dO, &trans_ind, &tmp_out, - input_height, input_width); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - return; - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - argsort, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel, - paddle::operators::ArgsortOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - argsort_grad, paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel, - paddle::operators::ArgsortGradOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h deleted file mode 100644 index d850e51a4bf061d3e5fc46bd53a2ef56610d6de9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/argsort_op.h +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" - -namespace paddle { -namespace operators { - -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -using Tensor = framework::Tensor; - -template -static void FullSort(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, T* t_out, Type* t_indices, - bool descending) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - std::vector> col_vec; - col_vec.reserve(input_width); - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(j), j)); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - col_vec.push_back(std::pair(e_input(i, j), j)); - } - } - std::sort(col_vec.begin(), col_vec.end(), - [&](const std::pair& l, const std::pair& r) { - if (descending) - return l.first > r.first; - else - return l.first < r.first; - }); - - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + j] = col_vec[j].first; - t_indices[i * input_width + j] = col_vec[j].second; - } - } -} - -template -static void FullAssign(Type input_height, Type input_width, int input_dim, - const framework::Tensor* input, - const framework::Tensor* indices, T* t_out) { -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (Type i = 0; i < input_height; ++i) { - if (input_dim == 1) { - auto e_input = EigenVector::Flatten(*input); - auto e_indices = EigenVector::Flatten(*indices); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(j)] = e_input(j); - } - } else { - auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); - auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); - for (Type j = 0; j < input_width; ++j) { - t_out[i * input_width + e_indices(i, j)] = e_input(i, j); - } - } - } -} - -template -class ArgsortKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - int axis = ctx.Attr("axis"); - bool descending = ctx.Attr("descending"); - - auto in_dims = input->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - T* out_data = output->mutable_data(ctx.GetPlace()); - - // Do full sort - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); - FullSort(input_height, input_width, in_dims.size(), input, - out_data, ids_data, descending); - } else { - // If not full sort do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_inp; - trans_inp.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *input, - &trans_inp, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - output->mutable_data(ctx.GetPlace()); - - Tensor tmp_indices; - - auto* t_ind = - tmp_indices.mutable_data(trans_dims, ctx.GetPlace()); - - FullSort(input_height, input_width, in_dims.size(), - &trans_inp, t_out, t_ind, descending); - - indices->mutable_data(ctx.GetPlace()); - TransCompute( - ndims, dev_ctx, tmp_indices, indices, trans); - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, - output, trans); - } - } -}; - -template -class ArgsortGradientKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* indices = ctx.Input("Indices"); - auto* dX = ctx.Output(framework::GradVarName("X")); - auto* dO = ctx.Input(framework::GradVarName("Out")); - int axis = ctx.Attr("axis"); - - auto in_dims = indices->dims(); - axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - - dX->mutable_data(ctx.GetPlace()); - auto dxt = framework::EigenVector::Flatten(*dX); - auto& place = *ctx.template device_context() - .eigen_device(); - dxt.device(place) = dxt.constant(static_cast(0)); - if (dO->numel() == 0) return; - - // Do full assign - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t input_width = in_dims[in_dims.size() - 1]; - - FullAssign(input_height, input_width, in_dims.size(), dO, - indices, dX->data()); - } else { - // If not full assign do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); - } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - auto& dev_ctx = ctx.template device_context(); - // Do transpose - TransCompute(ndims, dev_ctx, *dO, - &trans_dO, trans); - TransCompute( - ndims, dev_ctx, *indices, &trans_ind, trans); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - T* t_out = tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - FullAssign(input_height, input_width, in_dims.size(), - &trans_dO, &trans_ind, t_out); - - // transpose back - TransCompute(ndims, dev_ctx, tmp_out, dX, - trans); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc index 077be715bece0b4119dc0a578a1cba4631eb45f2..c927eec00bc8bf9e84ad1fb53a907ff8ec71acbc 100644 --- a/paddle/fluid/operators/argsort_op_npu.cc +++ b/paddle/fluid/operators/argsort_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/argsort_op_xpu.cc b/paddle/fluid/operators/argsort_op_xpu.cc index 18e81936a16c63a1d2693dfb47dc618c3e707ae0..359b00fcf87ee1bee27e668ae3973fa39be19d76 100644 --- a/paddle/fluid/operators/argsort_op_xpu.cc +++ b/paddle/fluid/operators/argsort_op_xpu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index 72488a932d9c33cbfeddc9f35818e42ebe0137fa..b452dea8536dd98d6d4060d5224e39daf9137c50 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -23,7 +23,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/cholesky_solve_op.cc b/paddle/fluid/operators/cholesky_solve_op.cc index 6b5bae8fc73fe2b71212a93144d89144dd0268c6..5403e2440ee58f1cf7cbad107f4d3e174655ed3b 100644 --- a/paddle/fluid/operators/cholesky_solve_op.cc +++ b/paddle/fluid/operators/cholesky_solve_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/cholesky_solve_op.h" -#include "paddle/fluid/operators/solve_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -39,50 +40,6 @@ class CholeskySolveOpMaker : public framework::OpProtoAndCheckerMaker { class CholeskySolveOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *context) const override { - OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "CholeskySolve"); - OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", "CholeskySolve"); - OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", "CholeskySolve"); - auto u_dims = context->GetInputDim("Y"); - auto b_dims = context->GetInputDim("X"); - int u_rank = u_dims.size(); - int b_rank = b_dims.size(); - PADDLE_ENFORCE_GE(u_rank, 2, - platform::errors::InvalidArgument( - "the rank of input Y must greater or equal to 2")); - PADDLE_ENFORCE_GE(b_rank, 2, - platform::errors::InvalidArgument( - "the rank of input X must greater or equal to 2")); - PADDLE_ENFORCE_EQ(u_dims[u_rank - 1], u_dims[u_rank - 2], - platform::errors::InvalidArgument( - "input Matrix Y should be square matrix," - "But Got last shape of %ld x %ld", - u_dims[u_rank - 1], u_dims[u_rank - 2])); - PADDLE_ENFORCE_EQ( - b_dims[b_rank - 2], u_dims[u_rank - 2], - platform::errors::InvalidArgument( - "the first dim of input X must equal to the dim of input Y," - "But Got %ld and %ld", - b_dims[b_rank - 2], u_dims[u_rank - 2])); - - std::vector u_dims_vec = phi::vectorize(u_dims); - std::vector b_dims_vec = phi::vectorize(b_dims); - - std::vector u_dims_vec_cut(u_dims_vec.begin(), - u_dims_vec.end() - 2); - std::vector b_dims_vec_cut(b_dims_vec.begin(), - b_dims_vec.end() - 2); - - std::vector expand_batch_portion = - get_broadcast_batch_portion(u_dims_vec_cut, b_dims_vec_cut); - - std::vector b_broadcast_dims({expand_batch_portion}); - b_broadcast_dims.insert(b_broadcast_dims.end(), - {b_dims_vec[b_rank - 2], b_dims_vec[b_rank - 1]}); - - // dim of 'Out' is the same with 'Y' after broadcast - context->SetOutputDim("Out", phi::make_ddim(b_broadcast_dims)); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -151,22 +108,15 @@ class CholeskySolveGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(cholesky_solve, CholeskySolveInferShapeFunctor, + PD_INFER_META(phi::CholeskySolveInferMeta)); + REGISTER_OPERATOR(cholesky_solve, ops::CholeskySolveOp, ops::CholeskySolveOpMaker, ops::CholeskySolveOpVarTypeInference, 
ops::CholeskySolveOpGradMaker, - ops::CholeskySolveOpGradMaker); + ops::CholeskySolveOpGradMaker, + CholeskySolveInferShapeFunctor); REGISTER_OPERATOR(cholesky_solve_grad, ops::CholeskySolveGradOp); - -REGISTER_OP_CPU_KERNEL( - cholesky_solve, - ops::CholeskySolveKernel, - ops::CholeskySolveKernel); - -REGISTER_OP_CPU_KERNEL( - cholesky_solve_grad, - ops::CholeskySolveGradKernel, - ops::CholeskySolveGradKernel); -// Complex<> is not supported because of TensorExpand, which used to boardcast -// input Tensor diff --git a/paddle/fluid/operators/cholesky_solve_op.cu b/paddle/fluid/operators/cholesky_solve_op.cu deleted file mode 100644 index 1b551a7cd0343db32a84e962212a25e1ff5a4893..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cholesky_solve_op.cu +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver - -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/cholesky_solve_op.h" -#include "paddle/fluid/platform/dynload/cusolver.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; - -template -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, - int n, int nrhs, T *Adata, int lda, T *Bdata, int ldb, - int *devInfo); - -template <> -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, - cublasFillMode_t uplo, int n, int nrhs, float *Adata, - int lda, float *Bdata, int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnSpotrs( - cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); -} - -template <> -void cusolver_potrs(const cusolverDnHandle_t &cusolverH, - cublasFillMode_t uplo, int n, int nrhs, - double *Adata, int lda, double *Bdata, int ldb, - int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnDpotrs( - cusolverH, uplo, n, nrhs, Adata, lda, Bdata, ldb, devInfo)); -} - -template <> -void cusolver_potrs>( - const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, - platform::complex *Adata, int lda, platform::complex *Bdata, - int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnCpotrs( - cusolverH, uplo, n, nrhs, reinterpret_cast(Adata), lda, - reinterpret_cast(Bdata), ldb, devInfo)); -} - -template <> -void cusolver_potrs>( - const cusolverDnHandle_t &cusolverH, cublasFillMode_t uplo, int n, int nrhs, - platform::complex *Adata, int lda, platform::complex *Bdata, - int ldb, int *devInfo) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDnZpotrs( - cusolverH, uplo, n, nrhs, - reinterpret_cast(Adata), lda, - reinterpret_cast(Bdata), ldb, devInfo)); -} - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::CUDADeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { - 
cublasFillMode_t uplo = - upper ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; - - /* step 1: get cusolver handle*/ - auto cusolverH = dev_ctx.cusolver_dn_handle(); - - /* step 2: solve A0*X0 = B0 */ - cusolver_potrs(cusolverH, uplo, n, nrhs, Adata, lda, Bdata, lda, - devInfo); - - PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); - } -}; - -template -class MatrixReduceSumFunctor { - public: - void operator()(const Tensor &in, Tensor *out, - const framework::ExecutionContext &ctx) { - // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3] - // out_reduce_dim should be [0, 2] - const std::vector in_dims = phi::vectorize(in.dims()); - auto in_size = in_dims.size(); - const std::vector out_dims = phi::vectorize(out->dims()); - auto out_size = out_dims.size(); - - std::vector out_bst_dims(in_size); - - std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1); - std::copy(out_dims.data(), out_dims.data() + out_size, - out_bst_dims.data() + in_size - out_size); - - std::vector out_reduce_dims; - for (size_t idx = 0; idx <= in_size - 3; idx++) { - if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) { - out_reduce_dims.push_back(idx); - } - } - gpuStream_t stream = ctx.cuda_device_context().stream(); - TensorReduceImpl>( - ctx.cuda_device_context(), in, out, kps::IdentityFunctor(), - out_reduce_dims, stream); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cholesky_solve, - ops::CholeskySolveKernel, - ops::CholeskySolveKernel); - -REGISTER_OP_CUDA_KERNEL( - cholesky_solve_grad, - ops::CholeskySolveGradKernel, - ops::CholeskySolveGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h deleted file mode 100644 index 74b961d4e55e8a6ca231285e44bed3e3401461dc..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/cholesky_solve_op.h +++ /dev/null @@ -1,252 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/solve_op.h" -#include "paddle/fluid/operators/triangular_solve_op.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/phi/kernels/funcs/lapack/lapack_function.h" -#include "paddle/phi/kernels/math_kernel.h" -#include "paddle/phi/kernels/transpose_kernel.h" - -namespace paddle { -namespace operators { // namespace operators - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::DeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo); -}; - -template -class CholeskySolveFunctor { - public: - void operator()(const platform::CPUDeviceContext &dev_ctx, bool upper, int n, - int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) { - char uplo = upper ? 
'U' : 'L'; - phi::funcs::lapackCholeskySolve(uplo, n, nrhs, Adata, lda, Bdata, lda, - devInfo); - } -}; - -template -void cholesky_solve_fn(const paddle::framework::ExecutionContext &ctx, - const framework::Tensor &uin, - const framework::Tensor &bin, framework::Tensor *out, - bool upper) { - const auto &dev_ctx = ctx.template device_context(); - // framework::Tensor broadcast - std::vector u_bst_dims_vec; - std::vector b_bst_dims_vec; - std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(uin, bin); - framework::Tensor u_bst(uin.type()); - TensorExpand(dev_ctx, uin, &u_bst, u_bst_dims_vec); - - framework::Tensor b_bst(bin.type()); - TensorExpand(dev_ctx, bin, &b_bst, b_bst_dims_vec); - - auto &phi_dev_ctx = static_cast< - const typename framework::ConvertToPhiContext::TYPE &>( - dev_ctx); - - // calculate u's conjugate for complex - framework::Tensor u_conj(u_bst.type()); - platform::ForRange u_for_range(dev_ctx, u_bst.numel()); - phi::funcs::ConjFunctor u_functor( - u_bst.data(), u_bst.numel(), - u_conj.mutable_data(u_bst.dims(), dev_ctx.GetPlace())); - u_for_range(u_functor); - u_conj = phi::TransposeLast2Dim(phi_dev_ctx, u_conj); - - // calculate b's conjugate for complex - framework::Tensor b_conj(b_bst.type()); - platform::ForRange b_for_range(dev_ctx, b_bst.numel()); - phi::funcs::ConjFunctor b_functor( - b_bst.data(), b_bst.numel(), - b_conj.mutable_data(b_bst.dims(), dev_ctx.GetPlace())); - b_for_range(b_functor); - b_conj = phi::TransposeLast2Dim(phi_dev_ctx, b_conj); - - auto ut_data = u_conj.mutable_data(dev_ctx.GetPlace()); - auto uindims = u_bst.dims(); - auto bindims = b_bst.dims(); - int uinrank = uindims.size(); - int binrank = bindims.size(); - - int n = uindims[uinrank - 2]; - int nrhs = bindims[binrank - 1]; - int ldab = std::max(1, n); - - // framework::Tensor out_copy(b_conj.type()); - // out_copy.Resize(b_conj.dims()); - framework::TensorCopy(b_conj, dev_ctx.GetPlace(), out); - T *out_data = out->mutable_data(dev_ctx.GetPlace()); - - auto info_dims = phi::slice_ddim(bindims, 0, binrank - 2); - auto batchsize = product(info_dims); - - framework::Tensor tmp; - std::vector tmpdim(1, batchsize); - tmp.Resize(phi::make_ddim(tmpdim)); - int *info = tmp.mutable_data(dev_ctx.GetPlace()); - - CholeskySolveFunctor functor; - for (int b = 0; b < batchsize; b++) { - auto uin_data_item = &ut_data[b * n * n]; - auto out_data_item = &out_data[b * n * nrhs]; - auto info_item = &info[b]; - functor(dev_ctx, upper, n, nrhs, uin_data_item, ldab, out_data_item, - info_item); - } - - // calculate out's conjugate for complex - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out->mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - *out = phi::TransposeLast2Dim(phi_dev_ctx, *out); -} - -template -class CholeskySolveKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto *uin = ctx.Input("Y"); - auto *bin = ctx.Input("X"); - auto *out = ctx.Output("Out"); - auto upper = ctx.Attr("upper"); - cholesky_solve_fn(ctx, *uin, *bin, out, upper); - } -}; - -template -class CholeskySolveGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *bin = ctx.Input("X"); - auto *uin = ctx.Input("Y"); - auto *out = ctx.Input("Out"); - auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *db = ctx.Output(framework::GradVarName("X")); - 
auto *du = ctx.Output(framework::GradVarName("Y")); - auto upper = ctx.Attr("upper"); - - const auto &dev_ctx = ctx.template device_context(); - auto &phi_dev_ctx = static_cast< - const typename framework::ConvertToPhiContext::TYPE &>( - dev_ctx); - - std::vector u_bst_dims_vec; - std::vector b_bst_dims_vec; - std::tie(u_bst_dims_vec, b_bst_dims_vec) = get_broadcast_dims(*uin, *bin); - framework::Tensor u_bst(uin->type()); - TensorExpand(dev_ctx, *uin, &u_bst, u_bst_dims_vec); - - framework::Tensor db_bst(bin->type()); - TensorExpand(dev_ctx, *bin, &db_bst, b_bst_dims_vec); - - if (dout) { - db->mutable_data(dev_ctx.GetPlace()); - cholesky_solve_fn(ctx, u_bst, *dout, &db_bst, upper); - - if (db_bst.dims() == db->dims()) { - framework::TensorCopy(db_bst, dev_ctx.GetPlace(), dev_ctx, db); - } else { - MatrixReduceSumFunctor functor; - functor(db_bst, db, ctx); - db->Resize(bin->dims()); - } - - auto blas = phi::funcs::GetBlas(ctx); - - // calculate out's conjugate for complex - framework::Tensor out_conj(out->type()); - platform::ForRange out_for_range(dev_ctx, out->numel()); - phi::funcs::ConjFunctor out_functor( - out->data(), out->numel(), - out_conj.mutable_data(out->dims(), dev_ctx.GetPlace())); - out_for_range(out_functor); - out_conj = phi::TransposeLast2Dim(phi_dev_ctx, out_conj); - - framework::Tensor commonterm(out->type()); - auto outdims = out_conj.dims(); - auto dbdims = db_bst.dims(); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(outdims, 0, false); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(dbdims, 0, false); - auto cmtdim = outdims; - cmtdim[cmtdim.size() - 2] = dbdims[dbdims.size() - 2]; - commonterm.Resize(cmtdim); - commonterm.mutable_data(dev_ctx.GetPlace()); - blas.MatMul(db_bst, mat_dim_b, out_conj, mat_dim_a, static_cast(1), - &commonterm, static_cast(0)); - - // calculate commonterm's conjugate for complex - framework::Tensor commonterm_conj(commonterm.type()); - platform::ForRange commonterm_for_range( - dev_ctx, commonterm.numel()); - phi::funcs::ConjFunctor commonterm_functor( - commonterm.data(), commonterm.numel(), - commonterm_conj.mutable_data(commonterm.dims(), - dev_ctx.GetPlace())); - commonterm_for_range(commonterm_functor); - commonterm_conj = phi::TransposeLast2Dim(phi_dev_ctx, commonterm_conj); - - phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), - commonterm, commonterm_conj, -1, &commonterm); - - auto mat_dim_u = - phi::funcs::CreateMatrixDescriptor(u_bst.dims(), 0, false); - auto mat_dim_c = - phi::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false); - - Tensor du_bst(uin->type()); - // get upper or lower triangular - du_bst.Resize(u_bst.dims()); - du_bst.mutable_data(dev_ctx.GetPlace()); - if (upper) { - blas.MatMul(u_bst, mat_dim_u, commonterm, mat_dim_c, static_cast(-1), - &du_bst, static_cast(0)); - } else { - blas.MatMul(commonterm, mat_dim_c, u_bst, mat_dim_u, static_cast(-1), - &du_bst, static_cast(0)); - } - - const auto &udims = u_bst.dims(); - const auto H = udims[udims.size() - 2]; - const auto W = udims[udims.size() - 1]; - platform::ForRange x_for_range(dev_ctx, u_bst.numel()); - TrilTriuCompute tril_triu_computer(du_bst.data(), 0, !upper, H, W, - u_bst.data()); - x_for_range(tril_triu_computer); - - du->mutable_data(dev_ctx.GetPlace()); - if (u_bst.dims() == du->dims()) { - framework::TensorCopy(u_bst, dev_ctx.GetPlace(), dev_ctx, du); - } else { - MatrixReduceSumFunctor functor; - functor(u_bst, du, ctx); - du->Resize(uin->dims()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git 
a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
index 2afee35112e6f74df74a095c492f60cce0f9786c..0edbee534c0b5d680717250e7702f272eacd0272 100644
--- a/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_instruction_run_op_test.cc
@@ -22,11 +22,17 @@ limitations under the License. */
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP(cinn_launch);
 USE_OP(cinn_instruction_run);
 USE_OP_ITSELF(elementwise_add);
 
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+#ifdef PADDLE_WITH_CUDA
+PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
+#endif
+
 namespace paddle::operators {
 using framework::paddle2cinn::CinnCompiler;
 
diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
index 460d417e61fd4c77dcc9d581a3b997e6ae7c8e4c..585f1caabed051134fd5ce7624c17b741b487ef0 100644
--- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
+++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc
@@ -26,12 +26,18 @@ limitations under the License. */
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP(cinn_launch);
 USE_OP(cinn_instruction_run);
 USE_OP_ITSELF(elementwise_add);
 DECLARE_double(eager_delete_tensor_gb);
 
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+#ifdef PADDLE_WITH_CUDA
+PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);
+#endif
+
 namespace paddle::operators {
 using framework::paddle2cinn::CinnCompiler;
 
diff --git a/paddle/fluid/operators/collective/c_allgather_op_mlu.cc b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f29bc57c9a5f4dbbfd53220ce187b386b3025e55
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc
@@ -0,0 +1,81 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
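A minimal sketch of the kernel-declaration idiom used by the two CINN test changes above (only names that appear in this diff; the surrounding gtest body is assumed):

#include "paddle/phi/core/kernel_registry.h"

// With the add kernels migrated to phi, USE_OP no longer links them in.
// Tests now declare the operator and its phi kernels separately:
USE_OP_ITSELF(elementwise_add);           // registers the op definition only
PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);  // links the phi CPU "add" kernel
#ifdef PADDLE_WITH_CUDA
PD_DECLARE_KERNEL(add, GPU, ALL_LAYOUT);  // GPU kernel, CUDA builds only
#endif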
*/ + +#include "paddle/fluid/operators/collective/c_allgather_op.h" + +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#endif +#include "paddle/fluid/framework/convert_utils.h" + +namespace paddle { +namespace operators { + +template +class CAllGatherOpMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_CNCL) + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + cnclDataType_t dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype())); + + int nranks = ctx.Attr("nranks"); + int rid = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = platform::CNCLCommContext::Instance().Get(rid, place); + PADDLE_ENFORCE_EQ( + nranks, comm->nranks(), + platform::errors::InvalidArgument("nranks: %s should equal to %s", + nranks, comm->nranks())); + + framework::DDim out_dims = x->dims(); + out_dims[0] *= nranks; + out->mutable_data(out_dims, place); + + uint32_t send_numel = x->numel(); + void* send_buff = reinterpret_cast(const_cast(x->data())); + void* recv_buff = reinterpret_cast(out->data()); + + mluStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(send_buff, recv_buff, send_numel, + dtype, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with MLU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(c_allgather, ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc index c0968581acda9950aaa8ee2b8f3af15e1db59a67..7206dd01bcaa3e588cc275c2fdf25e70aacc1663 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index 31b00a93f1396564907a7872e919ba6c96f666d8..0946ad8aca65e28835ea1d139fb94c309ce840a1 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
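The MLU allgather kernel above sizes its output before the collective runs (out_dims[0] *= nranks). A self-contained sketch of that shape rule, with a hypothetical helper name:

#include <cstdint>
#include <vector>

// c_allgather concatenates every rank's full tensor along axis 0, so only
// the leading dimension grows: [8, 32] gathered across 4 ranks -> [32, 32].
std::vector<int64_t> AllGatherOutDims(std::vector<int64_t> x_dims, int nranks) {
  x_dims[0] *= nranks;
  return x_dims;
}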
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 7e5120cd2b392b1eb0698727ccebac485193f6d9..2c4e85400ca4adadce5db1fd318ce2273caa201f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -413,7 +413,7 @@ class CAllReduceOpMLUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); cnclDataType_t dtype = - platform::ToCNCLDataType(framework::TransToProtoVarType(in->type())); + platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype())); int64_t numel = in->numel(); const void* sendbuff = in->data(); out->Resize(in->dims()); diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index 9c11704704ed420b14a6ccd9873e0bfbe143b4fe..61e5f27903477972ef10465ccfd6f8de8ce8fba6 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc index d315f211709e4f76c2d5c685721961a91c2102fe..d1e269fb5a4fe9505acf7043bc7a2cea36823ffa 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc @@ -31,7 +31,7 @@ class CBroadcastOPMLUKernel : public framework::OpKernel { auto out = ctx.Output("Out"); int numel = x->numel(); cnclDataType_t dtype = - platform::ToCNCLDataType(framework::TransToProtoVarType(x->type())); + platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype())); int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc index 5787090e6a52f2f37bd504a904108cd1d24caf5f..cf4d6a28744b368212fe8bcb0924001aa53b5a4e 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc index c79b2f92b69a1e6cc5c6f1cf17fa402c671a1997..c4e410d04da5fb5e9b6bfe4d7d5c263084889f54 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -26,7 +26,6 @@ limitations under the License. 
*/
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
index d9a7a4abb08fc883b9b9210fcdefd56af127263a..8b498787c69db0f978acaa68ba63883270e11eb4 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
index b8abf458c1c6d395fef08238abaa114ff5dc6e9e..133085ad3f3b0ffd00dbf4d026687b0311116951 100644
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
index bb78971734bf05e94f7b0ebc1f1540b254f98067..36c6f4fadd0fcc9b06c61d5c45ce6829f2d3d977 100644
--- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc
+++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
@@ -27,7 +27,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
index 8f7b8c4a9040be3a2b4540c693c128e92c06a180..6e02d362156970cdee7257c7d00b70cef0519757 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
@@ -26,7 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
index c40b2c3e76a02ce6e5e754b2dc4280d6917145e7..57e3dd53cc7748fa0fb66e7e934a1c9cd764a15f 100644
--- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc index 7c80917a71369e81dbab855a2dc9e0f6c35777e0..11633fb0b870327f14e4454b3f94a43940a9df53 100644 --- a/paddle/fluid/operators/cumsum_op.cc +++ b/paddle/fluid/operators/cumsum_op.cc @@ -12,8 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,17 +24,6 @@ namespace operators { class CumOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - if (ctx->Attrs().Get("flatten")) { - ctx->SetOutputDim("Out", - phi::make_ddim({phi::product(ctx->GetInputDim("X"))})); - } else { - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } - - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,10 +79,12 @@ class CumsumGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; - +DECLARE_INFER_SHAPE_FUNCTOR(cumsum, CumsumInferShapeFunctor, + PD_INFER_META(phi::CumsumInferMeta)); REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker, - ops::CumsumGradMaker); + ops::CumsumGradMaker, + CumsumInferShapeFunctor); REGISTER_OP_VERSION(cumsum) .AddCheckpoint( diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index 375ef4344f4741c947ef3134696d64cdae696780..f89ecd37222870f73d00870c9454bf5590d504e3 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -19,11 +19,17 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" +#include "paddle/phi/kernels/funcs/unsqueeze.h" +#include "paddle/phi/kernels/math_kernel.h" +#include "paddle/phi/kernels/matmul_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" namespace paddle { namespace operators { @@ -172,7 +178,7 @@ template class DeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* det = context.Input("Out"); const auto* grad = @@ -200,15 +206,18 @@ class 
DeterminantGradKernel : public framework::OpKernel { // checked in forward, pass } + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the matrix is invertible // (matrix A not invertible) == (det(A)=0) if (!CheckMatrixInvertible(context, det)) { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; ddet->Resize(input->dims()); - ddet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, ddet, static_cast(0.0f)); + phi::Full(dev_ctx, phi::vectorize(input->dims()), static_cast(0.0f), + ddet); return; } @@ -218,35 +227,35 @@ class DeterminantGradKernel : public framework::OpKernel { // we set d|A| = unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, // -1) - math::DeviceIndependenceTensorOperations helper(context); - // First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, inverse_A); + VLOG(3) << "(dA * |A|).transpose(-2, -1) dims: " << transpose_inverse_A.dims(); // Third: dA * |A| - auto mul_dA_detA = helper.Mul(*grad, *det); + auto mul_dA_detA = phi::Multiply(dev_ctx, *grad, *det); VLOG(3) << "dA * |A| dims: " << mul_dA_detA.dims(); // Fourth: unsqueeze(dA * |A|, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(mul_dA_detA, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(mul_dA_detA, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dA * |A|) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dA * |A|) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dA * |A|) * inverse(A) dims: " << res.dims(); @@ -331,7 +340,7 @@ template class SlogDeterminantGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = context.template device_context(); + auto& orig_dev_ctx = context.template device_context(); const auto* input = context.Input("Input"); const auto* slogdet = context.Input("Out"); const auto* grad = @@ -353,6 +362,10 @@ class SlogDeterminantGradKernel : public framework::OpKernel { input->dims().size() - grad->dims().size())); } + auto& dev_ctx = static_cast< + const typename framework::ConvertToPhiContext::TYPE&>( + orig_dev_ctx); + // Check Whether the matrix is invertible // (matrix A not invertible) == (absslogdet(A)=0) auto slogdet_vec = slogdet->Split(1, 0); @@ -361,9 +374,8 @@ class SlogDeterminantGradKernel : public framework::OpKernel { // The matrix is not invertible VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); - dslogdet->mutable_data(context.GetPlace()); - phi::funcs::SetConstant zero; - zero(dev_ctx, dslogdet, std::numeric_limits::quiet_NaN()); + phi::Full(dev_ctx, phi::vectorize(input->dims()), + std::numeric_limits::quiet_NaN(), dslogdet); return; } @@ -373,34 +385,25 @@ class SlogDeterminantGradKernel : public framework::OpKernel { 
// we set dsl|A| = unsqueeze(dslA, [-1, -2]) * // inverse(A).conj().transpose(-2, -1) - math::DeviceIndependenceTensorOperations helper(context); - // First: inverse(A) framework::Tensor inverse_A; // A must be square matrices! inverse_A.Resize(input->dims()); inverse_A.mutable_data(context.GetPlace()); - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *input, &inverse_A); + phi::funcs::MatrixInverseFunctor mat_inv; + mat_inv(orig_dev_ctx, *input, &inverse_A); VLOG(3) << "inverse(A) dims: " << inverse_A.dims(); // Second: inverse(A).conj() - framework::Tensor conj_inverse_A; - conj_inverse_A.Resize(inverse_A.dims()); - auto numel = input->numel(); - auto* conj_data = conj_inverse_A.mutable_data(context.GetPlace(), - size_t(numel * sizeof(T))); - - platform::ForRange for_range(dev_ctx, numel); - phi::funcs::ConjFunctor functor(inverse_A.data(), numel, conj_data); - for_range(functor); + auto conj_inverse_A = phi::Conj(dev_ctx, inverse_A); VLOG(3) << "inverse(A).conj() dims: " << conj_inverse_A.dims(); // Third: inverse(A).conj().transpose(-2, -1) - framework::Tensor transpose_inverse_A = helper.Transpose(conj_inverse_A); + framework::Tensor transpose_inverse_A = + phi::TransposeLast2Dim(dev_ctx, conj_inverse_A); VLOG(3) << "inverse(A).conj().transpose(-2, -1) dims: " << transpose_inverse_A.dims(); @@ -417,12 +420,12 @@ class SlogDeterminantGradKernel : public framework::OpKernel { det_grad.Resize(det_grad.dims().reshape(det_grad_vec)); // Fifth: unsqueeze(dslA, [-1, -2]) - auto unsqueeze1 = helper.Unsqueeze(det_grad, -1); - auto unsqueeze2 = helper.Unsqueeze(unsqueeze1, -2); + auto unsqueeze1 = phi::funcs::Unsqueeze(det_grad, -1); + auto unsqueeze2 = phi::funcs::Unsqueeze(unsqueeze1, -2); VLOG(3) << "unsqueezed(dslA, [-1, -2]) dims: " << unsqueeze2.dims(); // Finally: unsqueeze(dslA) * inverse(A) - auto res = helper.Mul(unsqueeze2, transpose_inverse_A); + auto res = phi::Multiply(dev_ctx, unsqueeze2, transpose_inverse_A); VLOG(3) << "unsqueeze(dslA) * inverse(A) dims: " << res.dims(); framework::TensorCopy(res, context.GetPlace(), dslogdet); diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 17665ad67e40e8b73e63f37147c62f8566ab68f0..144198367d538e178a745c22902bb77a65f45fe4 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -32,10 +32,9 @@ limitations under the License. 
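The determinant backward rewritten above encodes Jacobi's formula; in LaTeX, with batch dimensions elided and \(\bar{d}\) the incoming gradient of \(|A|\):

\[
\frac{\partial |A|}{\partial A} = |A|\,(A^{-1})^{\top},
\qquad
\mathrm{grad}_A = \bar{d}\cdot|A|\cdot(A^{-1})^{\top},
\]

which is exactly the "unsqueeze(dA * |A|, [-1, -2]) * inverse(A).transpose(-2, -1)" comment in the kernel, now expressed through phi::Multiply and phi::TransposeLast2Dim.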
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/dropout_impl_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/functors.h" namespace paddle { @@ -177,12 +176,13 @@ __global__ void DropoutGradCUDAKernel( } template -void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, - bool is_test, +void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, const std::string dropout_implementation, float dropout_prob, bool upscale_in_train, - bool is_fix_seed, int seed_val, const Tensor& x, - const Tensor* seed, Tensor* mask, Tensor* y) { + bool is_fix_seed, int seed_val, + const framework::Tensor& x, + const framework::Tensor* seed, + framework::Tensor* mask, framework::Tensor* y) { auto& place = *dev_ctx.eigen_device(); int64_t x_numel = x.numel(); auto stream = dev_ctx.stream(); @@ -220,7 +220,8 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, // VectorizedRandomGenerator use curand_uniform4, so we only support // vec_size is 4; int vec_size = (phi::GetVectorizedSize(x_data) == 4) ? 4 : 1; - auto gpu_config = GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); + auto gpu_config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); auto offset = ((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size; @@ -278,11 +279,13 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, } template -void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, +void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, const std::string dropout_implementation, - float dropout_prob, const Tensor& grad_y, - const Tensor& mask, int64_t size, - Tensor* grad_x, bool is_test = false) { + float dropout_prob, + const framework::Tensor& grad_y, + const framework::Tensor& mask, int64_t size, + framework::Tensor* grad_x, + bool is_test = false) { using MT = typename details::MPTypeTrait::Type; auto stream = dev_ctx.stream(); MT factor; diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h index d7db7dddce3887ca25ea1df34048f15663b2e987..c62d45570ba291dc60120c393d21842cc6548c61 100644 --- a/paddle/fluid/operators/dropout_impl_util.h +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, +inline void GetSeedDataAndIncrement(const phi::GPUContext& dev_ctx, const framework::Tensor* seed, const bool is_fix_seed, const int seed_val, const int offset, uint64_t* seed_data, diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 7613b04bccfdc2084decc0b383eec199f7e10991..6d52ce45c4c10099dbeb4d4fadbf91f8c390ef46 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/dropout_op.h" #include #include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -177,14 +177,3 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, ops::DropoutGradOpMaker, ops::DropoutGradOpMaker); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); -REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel, - ops::CPUDropoutKernel, - ops::CPUDropoutKernel); -REGISTER_OP_CPU_KERNEL( - dropout_grad, - ops::DropoutGradKernel, - ops::DropoutGradKernel, - ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu deleted file mode 100644 index f6ddff1d0327d3c7961781f875da69f89df1edec..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dropout_op.cu +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/dropout_impl.cu.h" -#include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random. -template -class GPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? 
context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = context.cuda_device_context(); - auto* mask = context.Output("Mask"); - mask->mutable_data(context.GetPlace()); - - bool is_fix_seed = context.Attr("fix_seed"); - int seed_val = context.Attr("seed"); - DropoutFwGPUKernelDriver(dev_ctx, is_test, dropout_implementation, - dropout_prob, upscale_in_train, is_fix_seed, - seed_val, *x, seed, mask, y); - } -}; - -template -class GPUDropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - auto size = grad_x->numel(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - float dropout_prob = context.Attr("dropout_prob"); - - bool is_test = context.Attr("is_test"); - - auto& dev_ctx = - context.template device_context(); - DropoutGradGPUKernelDriver(dev_ctx, dropout_implementation, dropout_prob, - *grad_y, *mask, size, grad_x, is_test); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - dropout, ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel, - ops::GPUDropoutKernel); -REGISTER_OP_CUDA_KERNEL( - dropout_grad, ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel, - ops::GPUDropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h deleted file mode 100644 index ea6ed0e61947470c22f18e47acce2fca4cb9c41f..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/dropout_op.h +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include -#include -#include - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenVector = framework::EigenVector; - -template -class CPUDropoutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* seed = - context.HasInput("Seed") ? 
context.Input("Seed") : nullptr; - auto* y = context.Output("Out"); - const auto* x_data = x->data(); - auto* y_data = y->mutable_data(context.GetPlace()); - float dropout_prob = context.Attr("dropout_prob"); - - auto& dropout_implementation = - context.Attr("dropout_implementation"); - bool upscale_in_train = (dropout_implementation == "upscale_in_train"); - if (!context.Attr("is_test")) { - auto* mask = context.Output("Mask"); - auto* mask_data = mask->mutable_data(context.GetPlace()); - size_t size = phi::product(mask->dims()); - - // Special case when dropout_prob is 1.0 - if (dropout_prob == 1.0f) { - std::memset(y_data, 0, size * sizeof(*y_data)); // NOLINT - std::memset(mask_data, 0, size * sizeof(*mask_data)); // NOLINT - return; - } - // std::minstd_rand engine; - // NOTE: fixed seed should only be used in unittest or for debug. - // Guarantee to use random seed in training. - int seed_data = 0; - if (seed) { - seed_data = *(seed->data()); - } else { - seed_data = - context.Attr("fix_seed") ? context.Attr("seed") : 0; - } - auto engine = framework::GetCPURandomEngine(seed_data); - - std::uniform_real_distribution dist(0, 1); - - for (size_t i = 0; i < size; ++i) { - if (dist(*engine) < dropout_prob) { - mask_data[i] = 0; - y_data[i] = 0; - } else { - mask_data[i] = 1; - if (upscale_in_train) { - y_data[i] = x_data[i] / static_cast(1.0f - dropout_prob); - } else { - y_data[i] = x_data[i]; - } - } - } - } else { - if (upscale_in_train) { - const auto* X_data = x->data(); - auto* Y_data = y->mutable_data(context.GetPlace()); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int i = 0; i < x->numel(); i++) { - Y_data[i] = X_data[i]; - } - } else { - auto X = EigenMatrix::Reshape(*x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); - auto& place = - *context.template device_context().eigen_device(); - Y.device(place) = X * static_cast(1.0f - dropout_prob); - } - } - } -}; -template -class DropoutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* mask = context.Input("Mask"); - grad_x->mutable_data(context.GetPlace()); - - auto dX = EigenVector::Flatten(*grad_x); - auto dY = EigenVector::Flatten(*grad_y); - - auto& place = - *context.template device_context().eigen_device(); - auto& dropout_implementation = - context.Attr("dropout_implementation"); - if (context.Attr("is_test") == true) { - if (dropout_implementation == "upscale_in_train") { - dX.device(place) = static_cast(1) * dY; - } else { - float dropout_prob = context.Attr("dropout_prob"); - dX.device(place) = dY * static_cast(1.0f - dropout_prob); - } - } else { - auto M = EigenVector::Flatten(*mask); - if (dropout_implementation == "upscale_in_train") { - float dropout_prob = context.Attr("dropout_prob"); - if (dropout_prob == 1.0f) { - dX.device(place) = static_cast(0) * dY; - } else { - dX.device(place) = - dY * M.cast() / static_cast(1.0f - dropout_prob); - } - } else { - dX.device(place) = dY * M.cast(); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index 6aae566760623c666f3ce82a890a119e3e173390..07b3b5381162575cbfc03dd8cc10d0c88a2d21e8 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -15,8 +15,8 @@ limitations under the License. 
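The deleted CPUDropoutKernel above (its behavior now lives in phi) implements standard inverted dropout. A minimal self-contained sketch of the train-time upscale_in_train rule it used; the mask dtype and the standalone function shape are assumptions:

#include <cstddef>
#include <cstdint>
#include <random>

// Dropped elements get mask 0 and output 0; kept elements are scaled by
// 1/(1-p) so E[y] == x and inference becomes a no-op. Assumes p < 1; the
// original kernel special-cases p == 1.0 by zeroing y and mask up front.
void DropoutTrainUpscale(const float* x, float* y, uint8_t* mask, size_t n,
                         float p, std::minstd_rand* engine) {
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  for (size_t i = 0; i < n; ++i) {
    if (dist(*engine) < p) {  // drop this element
      mask[i] = 0;
      y[i] = 0.0f;
    } else {  // keep and upscale
      mask[i] = 1;
      y[i] = x[i] / (1.0f - p);
    }
  }
}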
*/ #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #include "paddle/phi/core/ddim.h" diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 206d9a6c5e9c9869216f0a6c137accc931aa2a77..bdf08646f1d8b94d6d8d141d8a9fa9864cdc937b 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -24,14 +24,13 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(dropout); +USE_OP_ITSELF(dropout); void Compare(f::Scope* scope, const p::DeviceContext& ctx) { // init diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index 07b7e2cc7c09b09d6640f49fce438d58d0cc9cf2..7d8660f238abc8446b2988aad24a64c565e01ef9 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -8,15 +8,17 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/dropout_op.h" + #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { #ifdef PADDLE_WITH_XPU +using Tensor = framework::Tensor; template class DropoutXPUKernel : public framework::OpKernel { using XPUTyp = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/eigh_op.cc b/paddle/fluid/operators/eigh_op.cc index 553d0e679cc6ddebd68c3edbc2de70209364bb53..4e33c567eb6d12fc504bfd76bc83072836feda21 100644 --- a/paddle/fluid/operators/eigh_op.cc +++ b/paddle/fluid/operators/eigh_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eigh_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -22,42 +25,9 @@ using framework::Tensor; class EighOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvalues"), "Output", "Eigenvalues", - "Eigh"); - OP_INOUT_CHECK(ctx->HasOutput("Eigenvectors"), "Output", "Eigenvectors", - "Eigh"); - - auto input_dim = ctx->GetInputDim("X"); - auto rank = input_dim.size(); - - PADDLE_ENFORCE_GE(rank, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions." 
- "But received a %d dimension tensor.", - rank)); - PADDLE_ENFORCE_EQ( - input_dim[rank - 2], input_dim[rank - 1], - platform::errors::InvalidArgument( - "Eigh op is designed for square matrix, consequently" - "inner-most 2 dimensions of Input(X) should be symmetric." - "But received X's shape[-2] = %d and shape[-1] = %d.", - input_dim[rank - 2], input_dim[rank - 1])); - - std::vector values_dim; - - for (auto i = 0; i < rank - 1; i++) { - values_dim.emplace_back(input_dim[i]); - } - - ctx->SetOutputDim("Eigenvalues", phi::make_ddim(values_dim)); - ctx->SetOutputDim("Eigenvectors", input_dim); - } }; -class EignOpMaker : public framework::OpProtoAndCheckerMaker { +class EighOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", @@ -140,24 +110,11 @@ class EighGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(eigh, EighInferShapeFunctor, + PD_INFER_META(phi::EighInferMeta)); -REGISTER_OPERATOR(eigh, ops::EighOp, ops::EignOpMaker, +REGISTER_OPERATOR(eigh, ops::EighOp, ops::EighOpMaker, ops::EighGradOpMaker, - ops::EighGradOpMaker); + ops::EighGradOpMaker, + EighInferShapeFunctor); REGISTER_OPERATOR(eigh_grad, ops::EighGradOp); - -REGISTER_OP_CPU_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CPU_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.cu b/paddle/fluid/operators/eigh_op.cu deleted file mode 100644 index 827c551637d4df24529508ff37e6a92f157658a0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/eigh_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/eigh_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - eigh, ops::EighKernel, - ops::EighKernel, - ops::EighKernel>, - ops::EighKernel>); - -REGISTER_OP_CUDA_KERNEL( - eigh_grad, ops::EighGradKernel, - ops::EighGradKernel, - ops::EighGradKernel>, - ops::EighGradKernel>); diff --git a/paddle/fluid/operators/eigh_op.h b/paddle/fluid/operators/eigh_op.h deleted file mode 100644 index 5279ec750935c9b1b01584e893cc5e5f85d4a75c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/eigh_op.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/eigen_values_vectors.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class EighKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto input = ctx.Input("X"); - auto output_w = ctx.Output("Eigenvalues"); - auto output_v = ctx.Output("Eigenvectors"); - std::string lower = ctx.Attr("UPLO"); - bool is_lower = (lower == "L"); - math::MatrixEighFunctor functor; - functor(ctx, *input, output_w, output_v, is_lower, true); - } -}; - -template -class EighGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using ValueType = phi::dtype::Real; - auto& x_grad = *ctx.Output(framework::GradVarName("X")); - x_grad.mutable_data(ctx.GetPlace()); - auto& output_w = *ctx.Input("Eigenvalues"); - auto& output_v = *ctx.Input("Eigenvectors"); - auto& output_w_grad = - *ctx.Input(framework::GradVarName("Eigenvalues")); - auto& output_v_grad = - *ctx.Input(framework::GradVarName("Eigenvectors")); - - auto& dims = output_v.dims(); - const int m = dims[dims.size() - 1]; - auto dito = - math::DeviceIndependenceTensorOperations( - ctx); - auto tV = dito.Transpose(dito.Conj(output_v)); - auto W = dito.template Sub(dito.Unsqueeze(output_w, -2), - dito.Unsqueeze(output_w, -1)); - Tensor result = dito.Matmul(tV, output_v_grad); - result.mutable_data(dims, ctx.GetPlace()); - std::vector out_shape = phi::vectorize(dims); - auto constant = dito.Fill(out_shape, 0.5); - result = dito.Sub(result, dito.Conj(dito.Transpose(result))); - result = dito.Mul(result, constant); - result = dito.Div(result, W); - result = dito.DiagFill(m, m, m, 0, output_w_grad, result); - x_grad = dito.Matmul(output_v, dito.Matmul(result, tV)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 8e0bf78e9b7f9c08052c1463acf20c4493ffc9e1..14baeaa74d2421135401e94fbc10367d50b876fe 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -196,47 +196,6 @@ struct MinGradXYFunctor { } }; -template -struct MulGradFunctor { - inline HOSTDEVICE T operator()(const T a, const T b) const { return a * b; } -}; -template -struct MulGradFunctor> { - inline HOSTDEVICE Complex operator()(const Complex a, - const Complex b) const { - Complex b_conj(b.real, -b.imag); - return a * b_conj; - } -}; - -template -struct MulGradXYFunctor { - inline HOSTDEVICE phi::Array operator()(const InT a, const InT b, - const InT c) { - phi::Array outs; - // dx = dout * y - outs[0] = a * b; - // dy = dout * x - outs[1] = a * c; - return outs; - } -}; - -template -struct MulGradXYFunctor, Complex> { - inline HOSTDEVICE phi::Array, 2> operator()( - const Complex a, const Complex b, const Complex c) { - phi::Array, 2> outs; - // dx = dout * y - Complex b_conj(b.real, -b.imag); - outs[0] = a * b_conj; - // dy = dout * x - Complex c_conj(c.real, -c.imag); - outs[1] = a * c_conj; - return outs; - } -}; - // Ternary compare template struct MaxGradXFunctor { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc 
b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index e172279145e28c0731ed0d8d91769d0b293662fe..830e09eeae4811eb44bd4e21e17fe83ee44c592d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -173,55 +173,6 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>, ops::ElementwiseMulKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CPU_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); REGISTER_OP_VERSION(elementwise_mul) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 45c87a27a180af4798a9f8b31e2edfd0cacb583d..f7b9fd1e265f5d3f107e734f9ffdcc90e7f6cc77 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -63,33 +63,6 @@ class ElementwiseMulKernel } }; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - const auto& dev_ctx = - ctx.template device_context(); - const auto place = ctx.GetPlace(); - - if (dx != nullptr && dy != nullptr) { - std::vector ins = {dout, y, x}; - GetGradXAndYOut( - dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor()); - } else if (dx != nullptr && dy == nullptr) { - std::vector ins = {dout, y}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dx, MulGradFunctor()); - } else if (dx == nullptr && dy != nullptr) { - std::vector ins = {dout, x}; - GetGradXOrYOut(dev_ctx, place, axis, ins, dout, - dy, MulGradFunctor()); - } -} - } // namespace operators } // namespace paddle @@ -103,44 +76,3 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel>, - ops::ElementwiseMulGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_grad_grad, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - 
ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel, - ops::ElementwiseMulDoubleGradKernel>, - ops::ElementwiseMulDoubleGradKernel>); -REGISTER_OP_CUDA_KERNEL( - elementwise_mul_triple_grad, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel, - ops::ElementwiseMulTripleGradKernel>, - ops::ElementwiseMulTripleGradKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index c81266d584468f51030026e1423a649252001f58..58a3123c7e332f50b0830577436528f1e8df1cdf 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -137,244 +137,6 @@ class ElementwiseMulKernel : public framework::OpKernel { } } }; -template -struct MulGradDX { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } -}; - -template -struct MulGradDX> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex y_conj(y.real, -y.imag); - return dout * y_conj; - } -}; - -template -struct MulGradDY { - HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } -}; - -template -struct MulGradDY> { - HOSTDEVICE paddle::platform::complex operator()( - paddle::platform::complex x, paddle::platform::complex y, - paddle::platform::complex out, - paddle::platform::complex dout) const { - paddle::platform::complex x_conj(x.real, -x.imag); - return dout * x_conj; - } -}; -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy) { - int axis = ctx.Attr("axis"); - ElemwiseGradCompute, MulGradDY>( - ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template -typename std::enable_if< - std::is_same::value>::type -ElementwiseMulGrad(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - const framework::Tensor* out, const framework::Tensor* dout, - framework::Tensor* dx, framework::Tensor* dy); -#endif - -template -class ElementwiseMulGradKernel : public ElemwiseGradKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* out = dout; // out is not necessary - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - ElementwiseMulGrad(ctx, x, y, out, dout, dx, dy); - } -}; - -template -class ElementwiseMulDoubleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - 
auto* ddy = ctx.Input("DDY"); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - auto* ddout = ctx.Output("DDOut"); - - if (ddout) ddout->mutable_data(ctx.GetPlace()); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - // dx = dout * ddy - // dy = dout * ddx - // ddout = ddx * y + x * ddy - // change computation sequence to save memory, so ddout can inplace ddx and - // dx can be used as 'tmp' tensor - // (1) dx = x * ddy - // (2) dy = dout * ddx - // (3) ddout = ddx * y - // (4) ddout = ddout + dx - // (5) dx = dout * ddy - if (ddout) { - int axis = ctx.Attr("axis"); - auto& place = - *ctx.template device_context().eigen_device(); - // size(ddout) > size(ddx), ddout can't use memory of ddx using inplace - if (ddout->numel() > ddx->numel()) { - ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, dx, dy, MulGradDX(), - MulGradDY()); - - Tensor ddout_tmp; - ddout_tmp.mutable_data(ddout->dims(), ctx.GetPlace()); - - default_elementwise_mul(ctx, y, &ddx_safe, ddout); - default_elementwise_mul(ctx, &ddy_safe, x, - &ddout_tmp); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - } else { - // use dx to save memory, other than alloc tmp tensor - Tensor* ddout_tmp = dx; - - default_elementwise_mul(ctx, x, &ddy_safe, ddout_tmp); - // NOTE: in the following ElemwiseGradCompute, for the - // first output tensor is nullptr, the branch to calculate first - // output tensor will not be activated, DivGradDx function will not - // be called and can be ignored, the first branch has little effect - // on running speed. 
- ElemwiseGradCompute, MulGradDY>( - ctx, ddx_safe, ddy_safe, *dout, *dout, axis, nullptr, dy, - MulGradDX(), MulGradDY()); - default_elementwise_mul(ctx, &ddx_safe, y, ddout); - - auto ddout_t = framework::EigenVector::Flatten(*ddout); - auto ddout_tmp_t = framework::EigenVector::Flatten(*ddout_tmp); - ddout_t.device(place) = ddout_t + ddout_tmp_t; - default_elementwise_mul(ctx, dout, &ddy_safe, dx); - } - } - } -}; - -template -class ElementwiseMulTripleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - // get input - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input("DOut"); - auto* ddx = ctx.Input("DDX"); - auto* ddy = ctx.Input("DDY"); - - auto* d_dx = ctx.Input("D_DX"); - auto* d_dy = ctx.Input("D_DY"); - auto* d_ddout = ctx.Input("D_DDOut"); - - // get output - auto* out_d_x = ctx.Output("D_X"); - auto* out_d_y = ctx.Output("D_Y"); - auto* out_d_dout = ctx.Output("D_DOut"); - - auto* out_d_ddx = ctx.Output("D_DDX"); - auto* out_d_ddy = ctx.Output("D_DDY"); - - if (out_d_x) out_d_x->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_y) out_d_y->mutable_data(y->dims(), ctx.GetPlace()); - if (out_d_dout) out_d_dout->mutable_data(dout->dims(), ctx.GetPlace()); - if (out_d_ddx) out_d_ddx->mutable_data(x->dims(), ctx.GetPlace()); - if (out_d_ddy) out_d_ddy->mutable_data(y->dims(), ctx.GetPlace()); - - auto& place = *ctx.template device_context().eigen_device(); - - Tensor ddx_safe, ddy_safe; - GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe); - GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); - - if (d_ddout) { - if (out_d_x) { - // out_d_x = ddy * d_ddout - default_elementwise_mul(ctx, &ddy_safe, d_ddout, - out_d_x); - } - if (out_d_y) { - // out_d_y = ddx * d_ddout - default_elementwise_mul(ctx, &ddx_safe, d_ddout, - out_d_y); - } - } - - if (out_d_dout) { - // get out_d_dout - // out_d_dout = ddy * d_dx + d_dy * ddx - Tensor out_d_dout_tmp; - out_d_dout_tmp.mutable_data(dout->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, d_dy, &ddx_safe, - out_d_dout); - default_elementwise_mul(ctx, &ddy_safe, d_dx, - &out_d_dout_tmp); - auto out_d_dout_t = framework::EigenVector::Flatten(*out_d_dout); - auto out_d_dout_tmp_t = - framework::EigenVector::Flatten(out_d_dout_tmp); - out_d_dout_t.device(place) = out_d_dout_t + out_d_dout_tmp_t; - } - - if (out_d_ddx) { - // get out_d_ddx - // out_d_ddx = dout * d_dy + y * d_ddout - Tensor out_d_ddx_tmp; - out_d_ddx_tmp.mutable_data(ddx->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dy, out_d_ddx); - default_elementwise_mul(ctx, y, d_ddout, - &out_d_ddx_tmp); - auto out_d_ddx_t = framework::EigenVector::Flatten(*out_d_ddx); - auto out_d_ddx_tmp_t = framework::EigenVector::Flatten(out_d_ddx_tmp); - out_d_ddx_t.device(place) = out_d_ddx_t + out_d_ddx_tmp_t; - } - - if (out_d_ddy) { - // get out_d_ddy - // out_d_ddy = dout * d_dx + x * d_ddout - Tensor out_d_ddy_tmp; - out_d_ddy_tmp.mutable_data(ddy->dims(), ctx.GetPlace()); - default_elementwise_mul(ctx, dout, d_dx, out_d_ddy); - default_elementwise_mul(ctx, x, d_ddout, - &out_d_ddy_tmp); - auto out_d_ddy_t = framework::EigenVector::Flatten(*out_d_ddy); - auto out_d_ddy_tmp_t = framework::EigenVector::Flatten(out_d_ddy_tmp); - out_d_ddy_t.device(place) = out_d_ddy_t + out_d_ddy_tmp_t; - } - } -}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc 
b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index fc128a88f2096a26141ff7922b1d9166b8302ded..3e9263fe93acd93638ff9e496203b7ea432cea86 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc index 5222103256d614a2d6b1fa10662367ecb20d3cb2..ea009a38056f078689bd6dc4c9a41d2b34e8c1fa 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc @@ -17,8 +17,13 @@ #include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add_double_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_double_grad, GPU, ALL_LAYOUT); +#endif namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc index 9d4d11609ac2047aa8934cb2868f79359a816e12..ce5c6b701d95894db8e3a84215f537352914706a 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc @@ -21,9 +21,12 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc index 7890d634e9941718f3420bd50a7ded453379fc69..3cecc52a3c481cf9cb4a1e2eba6ded704a8fa8ee 100644 --- a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -27,9 +27,15 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_div); +PD_DECLARE_KERNEL(divide_double_grad, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(divide_double_grad, GPU, ALL_LAYOUT); +#endif + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/erf_op.cc b/paddle/fluid/operators/erf_op.cc index f68f670394871114369f8b05b7f958c03d5508d0..64274d098c0585c28196743c09d5e6c78c3fe37d 100644 --- a/paddle/fluid/operators/erf_op.cc +++ b/paddle/fluid/operators/erf_op.cc @@ -16,8 +16,10 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/erf_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -29,18 +31,6 @@ class ErfOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(%s) of ErfOp should not be null.", "X")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(%s) of ErfOp should not be null.", "Out")); - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -116,28 +106,10 @@ class ErfGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(erf, ErfInferShapeFunctor, + PD_INFER_META(phi::UnchangedInferMeta)); REGISTER_OPERATOR(erf, ops::ErfOp, ops::ErfOpMaker, ops::ErfGradOpMaker, - ops::ErfGradOpMaker); + ops::ErfGradOpMaker, + ErfInferShapeFunctor); REGISTER_OPERATOR(erf_grad, ops::ErfGradOp); -REGISTER_OP_CPU_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CPU_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); - -REGISTER_OP_CUDA_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CUDA_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); diff --git a/paddle/fluid/operators/erf_op.h b/paddle/fluid/operators/erf_op.h deleted file mode 100644 index 4780b2e7f5b28d4a743f6d35046891b30cbefd00..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/erf_op.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES -#endif -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -template -class ErfKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* out = context.Output("Out"); - auto* in = context.Input("X"); - out->mutable_data(in->place()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = - *context.template device_context().eigen_device(); - EigenErf, T>::Eval(place, eigen_out, - eigen_in); - } -}; - -template -class ErfGradKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& context) const { - auto* x = context.Input("X"); - auto* dout = - context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - - dx->mutable_data(dout->place()); - - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = - *context.template device_context().eigen_device(); - EigenErfGrad, T>::Eval(place, eigen_dx, - eigen_x, eigen_dout); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 119e514a49e28fb3295e36947664770889bbdd81..97a35a34f23e96707269482e29da13a15538cdca 100755 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -121,37 +121,9 @@ REGISTER_OPERATOR(expand_as_v2, ops::ExpandAsV2Op, ops::ExpandAsV2OpMaker, ops::ExpandAsV2GradOpMaker); REGISTER_OPERATOR(expand_as_v2_grad, ops::ExpandAsV2GradOp, ops::ExpandAsV2GradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel); -REGISTER_OP_CPU_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel); -REGISTER_OP_CUDA_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); -#endif REGISTER_OP_VERSION(expand_as_v2) .AddCheckpoint( R"ROC(fix expand_as_v2 and add new input [Y])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( - "Y", "Expand X according to the shape of Y")); \ No newline at end of file + "Y", "Expand X according to the shape of Y")); diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h index d7560efc5c1f1244ae4eed4c68c59a38287057ee..f09e7764eed3959c7f0ca700b953dbd0c2891d12 100755 --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -32,219 +32,5 @@ template using EigenTensor = framework::EigenTensor; -template -class ExpandAsV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = 
context.Input("X")->dims().size(); - auto target_shape = context.Attr>("target_shape"); - auto target_rank = target_shape.size(); - PADDLE_ENFORCE_GE(target_rank, rank, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be greater than or equal to " - "the rank (%d) of the input 'x'.", - target_rank, rank)); - PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' for " - "expand_as_v2 op must be positive.", - rank)); - PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank (%d) of the input 'target_tensor' for " - "expand_as_v2 op must be less than or equal to %d.", - target_rank, MAX_RANK_SUPPORTED)); - - switch (target_rank) { - case 1: - ExpandAs<1>(context); - break; - case 2: - ExpandAs<2>(context); - break; - case 3: - ExpandAs<3>(context); - break; - case 4: - ExpandAs<4>(context); - break; - case 5: - ExpandAs<5>(context); - break; - case 6: - ExpandAs<6>(context); - break; - } - } - - protected: - template - void ExpandAs(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - auto in_dims = in0->dims(); - auto target_shape = context.Attr>("target_shape"); - auto vec_in_dims = phi::vectorize(in_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - PADDLE_ENFORCE_NE(target_shape[i], 0, - platform::errors::InvalidArgument( - "The value of target shape cannot be zero.")); - if (i < diff) { - PADDLE_ENFORCE_GT( - target_shape[i], 0, - platform::errors::InvalidArgument( - "The expanded size (%d) for non-existing dimensions must be " - "positive for expand_as_v2 op.", - target_shape[i])); - repeat_times[i] = target_shape[i]; - } else if (target_shape[i] > 0) { - if (vec_in_dims[i] != 1) { - PADDLE_ENFORCE_EQ( - vec_in_dims[i], target_shape[i], - platform::errors::InvalidArgument( - "The value (%d) of the non-singleton dimension does not match" - " the corresponding value (%d) in shape for expand_as_v2 op.", - vec_in_dims[i], target_shape[i])); - repeat_times[i] = 1; - } else { - repeat_times[i] = target_shape[i]; - } - } else { - PADDLE_ENFORCE_EQ( - target_shape[i], -1, - platform::errors::InvalidArgument( - "When the value in shape is negative for expand_as_v2 op, " - "only -1 is supported, but the value received is %d.", - target_shape[i])); - repeat_times[i] = 1; - } - } - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims = phi::make_ddim(target_shape); - - out0->Resize(out_dims); - auto x = EigenTensor::From(*in0, new_in_dims); - out0->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*out0, out_dims); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, Rank>::Eval(place, y, x, - bcast_dims); - } -}; - -template -class ExpandAsV2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto target_shape = context.Attr>("target_shape"); - auto x_dims = in0->dims(); - auto vec_in_dims = phi::vectorize(x_dims); - auto diff = target_shape.size() - vec_in_dims.size(); - 
vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - std::vector repeat_times(vec_in_dims.size()); - for (size_t i = 0; i < vec_in_dims.size(); ++i) { - repeat_times[i] = target_shape[i] / vec_in_dims[i]; - } - std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < repeat_times.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(repeat_times[i]); - reshape_dims_vec.push_back(vec_in_dims[i]); - } - - int dims = reduce_dims_vec.size(); - bool just_copy = true; - for (size_t i = 0; i < repeat_times.size(); i++) { - if (repeat_times[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - framework::TensorCopy(*in0, context.GetPlace(), context.device_context(), - out0); - } else { - PADDLE_ENFORCE_GE(dims, 1, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_v2_grad op must be greater than or " - "equal to 1, but the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for " - "expand_as_v2_grad op must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, dims)); - switch (dims) { - case 1: - ExpandAsBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - ExpandAsBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - ExpandAsBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - ExpandAsBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - ExpandAsBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - ExpandAsBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void ExpandAsBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index cdd4e1dbaae6a6a74bb11be44589877234021764..df00ae54c1036b1b0f0899eb0a949d58c398aa48 100644 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 0eb84f18f25f03b1fd0310c5815ee342ff835a6f..27a235765227f15dd412dcd6ad55f2a24471c6da 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/attn_feed_forward.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace framework = paddle::framework; @@ -29,6 +30,11 @@ namespace platform = paddle::platform; USE_OP(matmul); USE_OP_ITSELF(elementwise_add); +PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT); +#endif + // get paddle matmul op results as baseline template void GetLinearOp(const std::vector &x, const std::vector &y, diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index 79018f2a97448a8c6265a969dad37bce77d1b7ee..cb03add3143278260d41c3893e7adad976908d4e 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel { tensor_value.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_value, value); NpuOpRunner runner; -#if (CANN_VERSION_CODE >= 503003) +#if (CANN_VERSION_CODE >= 503003 && CANN_VERSION_CODE < 504001) runner.SetType("FillD") .AddInput(tensor_value) .AddOutput(*out_var) diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..508730c3c7335dbad8cf70417d2c19be4a8480a2 --- /dev/null +++ b/paddle/fluid/operators/filter_by_instag_op.cu @@ -0,0 +1,655 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000 + +#if defined(PADDLE_WITH_CUDA) +#include +#endif + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/operators/filter_by_instag_op.h" + +#if defined(PADDLE_WITH_CUDA) +namespace cg = cooperative_groups; +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using SelectedRows = phi::SelectedRows; +using LoDTensor = framework::LoDTensor; + +template +using Vector = framework::Vector; + +#define WARP_SIZE 32 +#define MAX_WARP_NUM 32 + +#if defined(PADDLE_WITH_CUDA) + +template +__global__ void filter_copy_fuse_kernel( + const size_t N, const int ins_per_thread, size_t* x1_lods_data, + size_t* x2_lods_data, const int64_t* x2_data, const int64_t* x3_data, + int64_t filter_tag_size, T* out_data, int64_t* map_data, + size_t* map_lods_data, size_t* out_lods_data, size_t* out_idx_data, + const T* x1_data, int x1_embed_size, float* loss_weight_data, + float fill_value) { + // N is instance num + // one threads for ins_per_thread instances + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + cg::thread_block b = cg::this_thread_block(); + cg::thread_block_tile g = cg::tiled_partition(b); + + int gid = idx / WARP_SIZE; + + // general use + int thread_num = + (N + (ins_per_thread - 1)) / ins_per_thread; // real thread num + int total_warp_num = thread_num / WARP_SIZE; // 30 + int remain_thread_num = thread_num % WARP_SIZE; // 16 + + int warp_thread_num = -1; + if (gid < total_warp_num) { + warp_thread_num = WARP_SIZE; + } else { + warp_thread_num = remain_thread_num; + } + + int group_num = total_warp_num; + if (remain_thread_num > 0) { + group_num = total_warp_num + 1; + } + + if (gid >= group_num) return; + + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + + if (N < ins_end) ins_end = N; + + /* + if (!x1_lods_filled) { + for (int p = ins_start; p < ins_end; p++) { + x1_lods_data[p] = p; + } + if (idx == 0) { + x1_lods_data[N] = N; + } + } + + if (!x2_lods_filled) { + for (int p = ins_start; p < ins_end; p++) { + x2_lods_data[p] = p; + } + if (idx == 0) { + x2_lods_data[N] = N; + } + } + + if (!x1_lods_filled || !x2_lods_filled) { + b.sync(); + } + */ + + int flag_data[5]; + int prefix_sum_data[5]; + int prefix_sum_data2[5]; + + __shared__ int shr[MAX_WARP_NUM]; + __shared__ int shr2[MAX_WARP_NUM]; + __shared__ int shr3[MAX_WARP_NUM]; + + for (int p = ins_start; p < ins_end; p++) { + int ins_tag_start = x2_lods_data[p]; + int ins_tag_end = x2_lods_data[p + 1]; + flag_data[p - ins_start] = 0; + // filter logic + int i = ins_tag_start; + for (; i < ins_tag_end; i++) { + int64_t ins_tag = x2_data[i]; + int j = 0; + for (; j < filter_tag_size; j++) { + if (x3_data[j] == ins_tag) break; + } + // if ins_tag in filter tag + if (j < filter_tag_size) { + flag_data[p - ins_start] = 1; + break; + } + } + } + + int sum_addr = 0; + int sum_flag = 0; + int sum_out_lods = 0; + + int local_addr = 0; + int local_flag = 0; + int local_out_lods = 0; + + if (ins_start < ins_end) { + for (int p = ins_start; p < ins_end; p++) { + int previous = -1; + if (p == ins_start) { + previous = 0; + } else { + previous = 
prefix_sum_data[p - ins_start - 1]; + } + + prefix_sum_data[p - ins_start] = + previous + + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + local_addr = prefix_sum_data[ins_end - 1 - ins_start]; + sum_addr = local_addr; + + // flag + // local_flag = 0; + for (int p = ins_start; p < ins_end; p++) { + local_flag += flag_data[p - ins_start]; + } + sum_flag = local_flag; + + for (int p = ins_start; p < ins_end; p++) { + local_out_lods += + flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); + } + + sum_out_lods = local_out_lods; + } + + // 32 threads + for (int i = 1; i < warp_thread_num; i *= 2) { + int temp_addr = g.shfl_up(sum_addr, i); + int temp_flag = g.shfl_up(sum_flag, i); + int temp_out_lods = g.shfl_up(sum_out_lods, i); + + if (g.thread_rank() >= i) { + sum_addr += temp_addr; + sum_flag += temp_flag; + sum_out_lods += temp_out_lods; + } + } + + if (g.thread_rank() == warp_thread_num - 1) { + shr[gid] = sum_addr; + shr2[gid] = sum_flag; + shr3[gid] = sum_out_lods; + } + + b.sync(); + + int sum_addr2 = 0; + int sum_flag2 = 0; + int sum_out_lods2 = 0; + + // communicate between warp + if (g.thread_rank() < group_num) { + sum_addr2 = shr[g.thread_rank()]; + sum_flag2 = shr2[g.thread_rank()]; + sum_out_lods2 = shr3[g.thread_rank()]; + } + + for (int i = 1; i < group_num; i *= 2) { + int temp_addr2 = g.shfl_up(sum_addr2, i); + int temp_flag2 = g.shfl_up(sum_flag2, i); + int temp_out_lods2 = g.shfl_up(sum_out_lods2, i); + + if (g.thread_rank() >= i) { + sum_addr2 += temp_addr2; + sum_flag2 += temp_flag2; + sum_out_lods2 += temp_out_lods2; + } + } + + int sum_addr3 = g.shfl(sum_addr2, gid); + int sum_flag3 = g.shfl(sum_flag2, gid); + int sum_out_lods3 = g.shfl(sum_out_lods2, gid); + + int p_flag; + int p_addr; + int p_out_lods; + + if (ins_start < ins_end) { + p_addr = sum_addr3 - shr[gid] + sum_addr - local_addr; + p_flag = sum_flag3 - shr2[gid] + sum_flag - local_flag; + p_out_lods = sum_out_lods3 - shr3[gid] + sum_out_lods - local_out_lods; + + for (int p = ins_start; p < ins_end; p++) { + if (ins_start == p) { + prefix_sum_data2[p - ins_start] = p_addr; + } else { + prefix_sum_data2[p - ins_start] = + prefix_sum_data2[p - ins_start - 1] + + flag_data[p - ins_start - 1] * + (x1_lods_data[p] - x1_lods_data[p - 1]); + } + } + + if (gid == 0 && g.thread_rank() == group_num - 1) { + *out_idx_data = (sum_flag2 + 1); + map_lods_data[sum_flag2] = sum_flag2; + } + } + + int sum_out_lods4 = g.shfl(sum_out_lods2 + 1, group_num - 1); + + if (ins_start < ins_end) { + int out_lods_idx = p_flag + 1; + + // ins_start = 1 + // BUG fix + for (int p = ins_start; p < ins_end; p++) { + if (flag_data[p - ins_start] == 1) { + // batch_len = 2 + // batch_len = 4 + size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; + // t = 0 + // t = 1 + int t = out_lods_idx - 1; + // out_lods_data[0] = 0; + int previous; + + if (out_lods_idx == p_flag + 1) { + // out_lods_data[t] = p_out_lods; + previous = p_out_lods; + } else { + previous = out_lods_data[t]; + } + + map_data[t * 3] = (int64_t)previous; + map_data[t * 3 + 1] = x1_lods_data[p]; + map_lods_data[t] = t; + out_lods_data[out_lods_idx] = previous + batch_len; + map_data[t * 3 + 2] = batch_len; + out_lods_idx++; + } + } + + // fill loss_weight_data + if (sum_out_lods4 > 1) { + int out_data_num = sum_out_lods4 - 1; + int out_start = ins_start; + + if (out_start < out_data_num) { + int out_end = ins_end >= out_data_num ? 
out_data_num : ins_end; + for (int p = out_start; p < out_end; p++) { + loss_weight_data[p] = fill_value; + } + } + } + + for (int p = ins_start; p < ins_end; p++) { + // copy logic + if (flag_data[p - ins_start] == 1) { + auto output_start_idx = prefix_sum_data2[p - ins_start]; + T* dst = out_data + output_start_idx * x1_embed_size; + + const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; + const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; + + // optimized + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } + } + } + + b.sync(); +} + +template +__global__ void copy_grad_kernel(const size_t N, const int ins_per_thread, + const T* out_grad_data, T* x1_grad_data, + const int64_t* map_data, int x1_embed_size) { + // N is instance num + // one threads for one instance + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int ins_start = idx * ins_per_thread; + int ins_end = (idx + 1) * ins_per_thread; + + if (ins_start >= N) { + return; + } + if (ins_end > N) ins_end = N; + + for (int p = ins_start; p < ins_end; p++) { + T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; + const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; + const T* src_end = + out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size; + + for (const T *j = src_start; j != src_end; dst++, j++) { + *dst = *j; + } + } +} + +#endif + +template +class FilterByInstagGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + + gpuStream_t current_stream = context.cuda_device_context().stream(); + + int max_thread_num_per_block = 1024; + // context.cuda_device_context().GetMaxThreadsPerBlock(); + // X1 is global FC output + // Dim [batch size, embedding size] + const LoDTensor* x1 = context.Input("Ins"); + bool is_lod = context.Attr("is_lod"); + + int is_x1_lod = -1; + if (is_lod) + is_x1_lod = 1; + else + is_x1_lod = 0; + + int64_t out_val_if_empty = context.Attr("out_val_if_empty"); + size_t x1_embed_size = x1->dims()[1]; + // X2 is ins tag list + // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... 
]] + const LoDTensor* x2 = context.Input("Ins_tag"); + // expected auto = const int64_t + const int64_t* x2_data = x2->data(); + + // X3 is local fc tag list + // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] + const Tensor* x3 = context.Input("Filter_tag"); + const int64_t* x3_data = x3->data(); + + // int x2_lods_filled = 1; + + Vector x2_lods; + // Vector, in GPU + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + // x2_lods_filled = 1; + + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + // x2_lods.resize(x2->dims()[0] + 1); + // move to cuda + x2_lods.push_back(0); + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(i + 1); + } + } + + const size_t x2_lods_size = x2_lods.size() - 1; + paddle::framework::MixVector mixv_x2_lods(&x2_lods); + + size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); + + // Vector, in GPU + // int x1_lods_filled = 1; + Vector x1_lods; + + if (!is_x1_lod) { + // move to cuda + // x1_lods.resize(x1->dims()[0] + 1); + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } else { + // x1_lods = context.Input("Ins")->lod()[0]; + // new: lod_level=0 => lod() return {} + if (x1->lod().size() != 0) { // lod_level = 1 + // x1_lods_filled = 1; + x1_lods = x1->lod()[0]; + } else { // lod_level = 0 + // x1_lods.resize(x1->dims()[0] + 1); + // move to cuda + x1_lods.push_back(0); + for (int i = 0; i < x1->dims()[0]; i++) { + x1_lods.push_back(i + 1); + } + } + } + + paddle::framework::MixVector mixv_x1_lods(&x1_lods); + + size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place); + auto* x1_data = x1->data(); + + // set output value + // for those whose ins been dropout, set 0 for whole lines. + // otherwise, copy whole line + // Dim [local fc count, batch size, embedding size] + LoDTensor* out = context.Output("Out"); + LoDTensor* map = context.Output("IndexMap"); + LoDTensor* loss_weight = context.Output("LossWeight"); + + int out_first = x1_lods.back(); + // int out_first = x1->dims()[0]; + // if (x1_lods_filled) { + // out_first = x1_lods.back(); + // } + + out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); + loss_weight->Resize(phi::make_ddim({(int64_t)x2_lods_size, 1})); + + T* out_data = out->mutable_data(gpu_place); + int64_t* map_data = map->mutable_data(gpu_place); + float* loss_weight_data = loss_weight->mutable_data(gpu_place); + + int block_size = max_thread_num_per_block; + int ins_per_thread = (x2_lods_size + block_size - 1) / block_size; + dim3 block_dim(block_size); + dim3 grid_dim(1); + + Vector out_lods(x2_lods_size + 1, 0); + Vector map_lods(x2_lods_size + 1, 0); + + paddle::framework::MixVector mixv_out_lods(&out_lods); + paddle::framework::MixVector mixv_map_lods(&map_lods); + + // thrust::device_vector out_idx(1); + Vector out_idx(1, 0); + paddle::framework::MixVector mixv_out_idx(&out_idx); + + size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place); + size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place); + size_t* map_lods_data = mixv_map_lods.CUDAMutableData(gpu_place); + + float fill_value = 1.0; + + filter_copy_fuse_kernel<<>>( + x2_lods_size, ins_per_thread, x1_lods_data, x2_lods_data, x2_data, + x3_data, x3->numel(), out_data, map_data, map_lods_data, out_lods_data, + out_idx_data, x1_data, x1_embed_size, loss_weight_data, fill_value); + + platform::GpuStreamSync(current_stream); + + mixv_out_lods.resize(mixv_out_idx[0]); + + 
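// ---------------------------------------------------------------------------
// Aside: a host model of the g.shfl_up() loops used in filter_copy_fuse_kernel
// above. Each doubling step adds the value sitting `step` lanes back, turning
// per-lane partial sums into an inclusive prefix sum (a Hillis-Steele scan);
// the shared-memory arrays shr/shr2/shr3 then carry per-warp totals so the
// same scan can be repeated across warps.
#include <array>
#include <cassert>
#include <cstddef>

int main() {
  std::array<int, 32> v{};  // one "warp" of lane values
  for (size_t i = 0; i < v.size(); ++i) v[i] = 1;  // all-ones input
  for (size_t step = 1; step < v.size(); step *= 2) {
    std::array<int, 32> prev = v;  // shfl_up reads the pre-step values
    for (size_t i = step; i < v.size(); ++i) v[i] += prev[i - step];
  }
  for (size_t i = 0; i < v.size(); ++i)
    assert(v[i] == static_cast<int>(i) + 1);  // inclusive prefix sum of ones
  return 0;
}
// ---------------------------------------------------------------------------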
if (mixv_out_lods.size() - 1 > 0) { + out->Resize(phi::make_ddim( + {(int64_t)mixv_out_lods.back(), (int64_t)x1_embed_size})); + + map->Resize(phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 3})); + loss_weight->Resize( + phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 1})); + + } else { + out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); + map->Resize(phi::make_ddim({1, 3})); + loss_weight->Resize(phi::make_ddim({1, 1})); + } + + if (mixv_out_lods.size() - 1 > 0) { + map_lods.resize(mixv_out_lods.size()); + + mixv_map_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + + map->set_lod(map_lod_info); + loss_weight->set_lod(map_lod_info); + + mixv_out_lods.CopyToCPU(); + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + } else { + Vector map_lods(2, 0); + paddle::framework::MixVector mixv_map_lods(&map_lods); + thrust::device_ptr map_data_ptr(map_data); + + map_data_ptr[0] = 0; + map_data_ptr[1] = 1; + map_data_ptr[2] = 1; + + mixv_map_lods[0] = 0; + mixv_map_lods[1] = 1; + mixv_out_lods.push_back(1); + + mixv_map_lods.CopyToCPU(); + mixv_out_lods.CopyToCPU(); + + std::vector> map_lod_info; + map_lod_info.emplace_back(map_lods); + map->set_lod(map_lod_info); + + loss_weight->set_lod(map_lod_info); + + std::vector> out_lod_info; + out_lod_info.emplace_back(out_lods); + out->set_lod(out_lod_info); + + thrust::device_ptr out_data_ptr(out_data); + + // gpu kernel + if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else if (std::is_same::value) { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } else { + thrust::fill(out_data_ptr, out_data_ptr + out->numel(), + static_cast(out_val_if_empty)); + } + + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + loss_weight_data_ptr[0] = 0; + } + +#endif + } +}; + +template +class FilterByInstagGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { +#if defined(PADDLE_WITH_CUDA) + + auto gpu_place = context.GetPlace(); + gpuStream_t current_stream = context.cuda_device_context().stream(); + auto max_thread_num_per_block = 1024; + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* x1_grad = context.Output(framework::GradVarName("Ins")); + auto* loss_weight = context.Input("LossWeight"); + auto* mmap = context.Input("IndexMap"); + auto* x1 = context.Input("Ins"); + + x1_grad->set_lod(context.Input("Ins")->lod()); + x1_grad->Resize(x1->dims()); + + auto* mmap_data = mmap->data(); + // expected auto = T + auto* output_grad_data = output_grad->data(); + auto* loss_weight_data = loss_weight->data(); + + // expected auto = T + auto* x1_grad_data = x1_grad->mutable_data(gpu_place); + thrust::device_ptr x1_grad_data_ptr(x1_grad_data); + thrust::device_ptr loss_weight_data_ptr(loss_weight_data); + + thrust::fill(x1_grad_data_ptr, + x1_grad_data_ptr + x1->dims()[0] * x1->dims()[1], 0); + + if (loss_weight->numel() != 1 || loss_weight_data_ptr[0] != 0) { + auto output_dims = output_grad->dims(); + int x1_embed_size = output_dims[1]; + + // one thread for multi-instances + int block_size = max_thread_num_per_block; + + size_t N = mmap->dims()[0]; + dim3 block_dim(block_size); + + dim3 grid_dim((N + block_size - 1) / 
block_size); + + const int ins_per_thread = 1; + + copy_grad_kernel<<>>( + N, ins_per_thread, output_grad_data, x1_grad_data, mmap_data, + x1_embed_size); + + cudaStreamSynchronize(current_stream); + } + +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL(filter_by_instag, ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel, + ops::FilterByInstagGPUKernel); + +REGISTER_OP_CUDA_KERNEL(filter_by_instag_grad, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel, + ops::FilterByInstagGradGPUKernel); diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h index deb2aa96b539e360cf2edad97b21cb6e9ddba066..3abc980ceaafc3719c13cad51c346282be2c694f 100644 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -61,7 +61,20 @@ class FilterByInstagKernel : public framework::OpKernel { // expected auto = const int64_t auto* x2_data = x2->data(); // e.g get [0, 1, 2, 3, ...] - size_t x2_lods_size = x2->dims()[0]; + // size_t x2_lods_size = x2->dims()[0]; + // size_t instag_num_per_ins = x2->dims()[1]; + + Vector x2_lods(1, 0); + if (x2->lod().size() != 0) { // lod_level = 1 + x2_lods = x2->lod()[0]; + } else { // lod_level = 0 + const size_t x2_lods_size = x2->dims()[0]; + const size_t instag_num_per_ins = x2->dims()[1]; + for (size_t i = 0; i < x2_lods_size; i++) { + x2_lods.push_back(x2_lods.back() + instag_num_per_ins); + } + } + Vector x1_lods(1, 0); if (!is_x1_lod) { for (int i = 0; i < x1->dims()[0]; i++) { @@ -79,8 +92,8 @@ class FilterByInstagKernel : public framework::OpKernel { } std::unordered_map mmap_aux; Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods_size; i++) { - for (size_t j = i; j < i + 1; j++) { + for (size_t i = 0; i < x2_lods.size() - 1; i++) { + for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { if (filter_tag.find(x2_data[j]) != filter_tag.end()) { size_t batch_len = x1_lods[i + 1] - x1_lods[i]; mmap_aux[out_lods.back()] = x1_lods[i]; @@ -165,8 +178,10 @@ class FilterByInstagKernel : public framework::OpKernel { out_data[oi] = (int32_t)out_val_if_empty; } else if (std::is_same::value) { out_data[oi] = (int64_t)out_val_if_empty; - } else { + } else if (std::is_same::value) { out_data[oi] = static_cast(out_val_if_empty); + } else { + out_data[oi] = static_cast(out_val_if_empty); } } loss_weight_data[0] = 0; diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 40ec9aef190ff4bacd52b19a1c0b12300a35b61e..92f59e118c3b7bb66a2c5c76d66109ddf04ee076 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -95,6 +95,17 @@ class FoldOp : public framework::OperatorWithKernel { "but recieved strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations + PADDLE_ENFORCE_GT(output_height, 1, + platform::errors::InvalidArgument( + "The `output_height` should be greater than one, " + "but recieved output_height: %d .", + output_height)); + PADDLE_ENFORCE_GT(output_width, 1, + platform::errors::InvalidArgument( + "The `output_width` should be greater than one, " + "but recieved output_width: %d .", + output_width)); + // check output size PADDLE_ENFORCE_GT( dilation_height, 0, platform::errors::InvalidArgument( @@ -146,7 +157,7 @@ class FoldOp : public framework::OperatorWithKernel { output_width)); PADDLE_ENFORCE_EQ( - 
blocks_height * blocks_width, in_dims[1], + blocks_height * blocks_width, in_dims[2], platform::errors::InvalidArgument( "Given input output_size (%d, %d), " "kernel_sizes (%d, %d), strides (%d, %d), dilations (%d, %d), " @@ -156,6 +167,15 @@ class FoldOp : public framework::OperatorWithKernel { strides[0], strides[1], dilations[0], dilations[1], blocks_height, blocks_width, blocks_height * blocks_width, in_dims[2])); + PADDLE_ENFORCE_EQ( + in_dims[1] % (kernel_sizes[0] * kernel_sizes[1]), 0, + platform::errors::InvalidArgument( + "Expected size of input's dimension 1 to be divisible by the" + "product of kernel_size, but got input.size(1)=%d and " + "kernel_size=( %d" + ", %d).", + in_dims[1], kernel_sizes[0], kernel_sizes[1])); + out_dims.push_back(output_height); out_dims.push_back(output_width); ctx->SetOutputDim("Y", phi::make_ddim(out_dims)); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index b3792a176fabeb8406fd2f1b83c6723207dad2f1..a80f590aa495db8090a30118ed4128843c0f8860 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -405,8 +405,18 @@ TEST(CudnnNormConvFp16, K1S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 3, output_channels = input_channels @@ -421,8 +431,18 @@ TEST(CudnnNormConvFp16, K3S1) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, output_channels = input_channels * 4 @@ -437,8 +457,18 @@ TEST(CudnnNormConvFp16, K1S1O4) { CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.CheckForward(1e-3, true); - test.CheckBackward(1e-3, true); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3, true), + paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3, true)); + } } // test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4 diff --git a/paddle/fluid/operators/fused/fmha_ref.h 
b/paddle/fluid/operators/fused/fmha_ref.h index 020277675797358bf87a58ac108e6eaaddb26ccc..54e4cbdc1624921e6946210a6a192d10fcbdb7dd 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { @@ -69,20 +70,21 @@ class FMHARef { ~FMHARef() {} void ComputeForward(const Tensor& qkv_input_tensor, + const Tensor* cache_kv_tensor, const Tensor* src_mask_tensor, - Tensor* transpose_2_out_tensor, Tensor* qk_out_tensor, + Tensor* transpose_2_out_tensor, + Tensor* cache_kv_out_tensor, Tensor* qk_out_tensor, Tensor* src_mask_out_tensor, Tensor* softmax_out_tensor, Tensor* dropout_mask_out_tensor, Tensor* dropout_out_tensor, Tensor* qktv_out_tensor, Tensor* fmha_out_tensor) { // input shape: [bs, seq_len, 3, num_head, head_dim] - // transpose with perm [2, 0, 1, 3, 4], + // transpose with perm [2, 0, 3, 1, 4], // output_shape: [3, bs, num_head, seq_len, head_dim] int ndims = 5; std::vector perm_1 = {2, 0, 3, 1, 4}; TransposeGPUKernelDriver(dev_ctx_, ndims, qkv_input_tensor, perm_1, transpose_2_out_tensor); - T* qkv_data = transpose_2_out_tensor->data(); T* qk_out_data = qk_out_tensor->data(); T* qktv_out_data = qktv_out_tensor->data(); @@ -90,11 +92,30 @@ class FMHARef { T* dropout_out_data = dropout_out_tensor->data(); T* fmha_out_data = fmha_out_tensor->data(); - int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; - int k_size = q_size; + auto out_seq_len = seq_len_; + if (cache_kv_tensor) { + // kv [2, bs, num_head, seq_len, head_dim] + auto kv_tensor = transpose_2_out_tensor->Slice(1, 3); + phi::funcs::ConcatFunctor concat; + // out [2, bs, num_head, cache_seq_len + seq_len, head_dim] + concat(dev_ctx_, {*cache_kv_tensor, kv_tensor}, 3, cache_kv_out_tensor); + out_seq_len = cache_kv_out_tensor->dims()[3]; + } + + int64_t q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; T* q_ptr = qkv_data; - T* k_ptr = q_ptr + q_size; - T* v_ptr = k_ptr + k_size; + T* k_ptr = nullptr; + T* v_ptr = nullptr; + + if (cache_kv_tensor) { + int64_t k_size = cache_kv_out_tensor->numel() / 2; + k_ptr = cache_kv_out_tensor->data(); + v_ptr = k_ptr + k_size; + } else { + int64_t k_size = q_size; + k_ptr = q_ptr + q_size; + v_ptr = k_ptr + k_size; + } // q*k^t, batched_gemm CBLAS_TRANSPOSE transA = CblasNoTrans; @@ -102,7 +123,7 @@ class FMHARef { auto blas = phi::funcs::GetBlas(dev_ctx_); int gemm_batch_size = batch_size_ * num_head_; int gemm_m = seq_len_; - int gemm_n = seq_len_; + int gemm_n = out_seq_len; int gemm_k = head_dim_; T alpha = static_cast(1.0 / sqrt(head_dim_)); T beta = static_cast(0.0); @@ -133,16 +154,16 @@ class FMHARef { transB = CblasNoTrans; gemm_m = seq_len_; gemm_n = head_dim_; - gemm_k = seq_len_; + gemm_k = out_seq_len; alpha = static_cast(1.0); stride_a = gemm_m * gemm_k; stride_b = gemm_k * gemm_n; if (dropout_param_.dropout_prob_) { DropoutFwGPUKernelDriver( - dev_ctx_, dropout_param_.is_test_, - static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + dropout_param_.is_test_, static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, dropout_param_.is_upscale_in_train_, dropout_param_.is_fix_seed_, 
dropout_param_.seed_val_, static_cast(*softmax_out_tensor), dropout_param_.seed_, @@ -242,8 +263,9 @@ class FMHARef { // dropout bw if (dropout_param_.dropout_prob_) { DropoutGradGPUKernelDriver( - dev_ctx_, static_cast( - dropout_param_.dropout_implementation_), + static_cast(dev_ctx_), + static_cast( + dropout_param_.dropout_implementation_), dropout_param_.dropout_prob_, static_cast(*dropout_out_grad_tensor), dropout_mask_out_tensor, softmax_out_grad_tensor->numel(), diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index d141800d61c0ec0b73fe2cc3c8d00dbf1de44cf2..e473f8ff0662cfc3fd7bdc5010bfa1dc08fba85f 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -61,6 +61,10 @@ class FusedAttentionOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasOutput("QKTVOut"), "Output", "QKTVOut", "FusedAttentionOp"); + if (ctx->HasInput("CacheKV")) { + OP_INOUT_CHECK(ctx->HasOutput("CacheKVOut"), "Output", "CacheKVOut", + "FusedAttentionOp"); + } if (ctx->HasInput("SrcMask")) { OP_INOUT_CHECK(ctx->HasOutput("SrcMaskOut"), "Output", "SrcMaskOut", "FusedAttentionOp"); @@ -105,12 +109,14 @@ class FusedAttentionOp : public framework::OperatorWithKernel { "input qkv_weight = [%s]", x_dim, y_dim)); - PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], - platform::errors::InvalidArgument( - "The dimensions of qkv_weight must be 4" - "(3, num_head, dim_head, dim_embed)," - "and must satisfy the limitations: " - "(num_head * dim_head == dim_embed)")); + if (ctx->Attrs().Get("ring_id") == -1) { + PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], y_dim[3], + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + } if (ctx->Attrs().Get("pre_layer_norm") == true) { ctx->SetOutputDim("LnMean", {x_dim[0] * x_dim[1]}); @@ -132,20 +138,64 @@ class FusedAttentionOp : public framework::OperatorWithKernel { // [3, batch_size, num_head, seq_len, head_size] ctx->SetOutputDim("TransposeOut2", {y_dim[0], x_dim[0], y_dim[1], x_dim[1], y_dim[2]}); - // [batch, num_head, seq_len, seq_len] - ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + + // cache_seq_len + seq_len if cache else seq_len + auto out_seq_len = x_dim[1]; + if (ctx->HasInput("CacheKV")) { + // [2, batch_size, num_head, cache_seq_len, head_size] + auto c_dim = ctx->GetInputDim("CacheKV"); + + PADDLE_ENFORCE_EQ( + c_dim.size(), 5, + paddle::platform::errors::InvalidArgument( + "The CacheKV must be 5 dims, but got %d", c_dim.size())); + PADDLE_ENFORCE_EQ(c_dim[0], 2, + paddle::platform::errors::InvalidArgument( + "The first dim of CacheKV must be 2, but got %d", + c_dim[0])); // 2 + PADDLE_ENFORCE_EQ(c_dim[1], x_dim[0], + paddle::platform::errors::InvalidArgument( + "The second dim of CacheKV must be equal with " + "batch size %d, but got %d", + x_dim[0], c_dim[1])); // batch_size + PADDLE_ENFORCE_EQ(c_dim[2], y_dim[1], + paddle::platform::errors::InvalidArgument( + "The third dim of CacheKV must be equal with num " + "head %d, but got %d", + y_dim[1], c_dim[2])); // num_head + PADDLE_ENFORCE_GE( + c_dim[3], 0, + paddle::platform::errors::InvalidArgument( + "The forth dim of CacheKV must be greater than 0, but got %d", + c_dim[3])); // cache_seq_len + PADDLE_ENFORCE_EQ(c_dim[4], y_dim[2], + paddle::platform::errors::InvalidArgument( + "The fifth dim of CacheKV must 
be equal with head "
+                          "size %d, but got %d",
+                          y_dim[2], c_dim[4]));  // head_size
+
+      out_seq_len += c_dim[3];
+      // [3, batch_size, num_head, cache_seq_len + seq_len, head_size]
+      ctx->SetOutputDim("CacheKVOut",
+                        {c_dim[0], c_dim[1], c_dim[2], out_seq_len, c_dim[4]});
+    }
+
+    // [batch, num_head, seq_len, out_seq_len]
+    ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], out_seq_len});
     if (ctx->HasInput("SrcMask")) {
-      ctx->SetOutputDim("SrcMaskOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+      ctx->SetOutputDim("SrcMaskOut",
+                        {x_dim[0], y_dim[1], x_dim[1], out_seq_len});
     }
     // the same as QKOut's shape.
     ctx->SetOutputDim("AttnDropoutOut",
-                      {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+                      {x_dim[0], y_dim[1], x_dim[1], out_seq_len});
     if (ctx->Attrs().Get<bool>("attn_dropout_is_test") == false) {
       ctx->SetOutputDim("AttnDropoutMaskOut",
-                        {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+                        {x_dim[0], y_dim[1], x_dim[1], out_seq_len});
     }
-    ctx->SetOutputDim("SoftmaxOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]});
+    ctx->SetOutputDim("SoftmaxOut",
+                      {x_dim[0], y_dim[1], x_dim[1], out_seq_len});
     // [batch_size, num_heads, seq_len, head_dim]
     ctx->SetOutputDim("QKTVOut", {x_dim[0], y_dim[1], x_dim[1], y_dim[2]});
     // [batch_size, seq_len, number of heads*head size]
@@ -182,6 +232,8 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
         .AsDispensable();
     AddInput("QKVW", "The qkv weight tensor.");
     AddInput("QKVBias", "The qkv bias tensor.").AsDispensable();
+    AddInput("CacheKV", "(optional) The cached KV for generation inference.")
+        .AsDispensable();
     AddInput("SrcMask", "(optional) The attention mask tensor in fmha.")
         .AsDispensable();
     AddInput("OutLinearW", "The out_linear weight tensor.");
@@ -217,6 +269,7 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("BiasDropoutResidualOut",
               "Result of residual + dropout(src + bias).")
         .AsIntermediate();
+    AddOutput("CacheKVOut", "The updated cache KV.");
     AddOutput("Y", "Result after attention.");
     AddAttr<bool>("pre_layer_norm",
@@ -324,6 +377,10 @@ class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker {
                             "0.0 and 0.001, But received [%s].", ln_epsilon));
         });
+    AddAttr<int>(
+        "ring_id",
+        "ring id for tensor model parallel, distributed training and inference")
+        .SetDefault(-1);
     AddComment(R"DOC(
   Add fused attention op whose logic is as follows:
diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu
index 03f51fc5857985902c21ad12fefbdc9cdec6ef04..d26577f06fe683fb1528c61b4401b9e578c90c9f 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_attention_op.cu
@@ -27,11 +27,39 @@ limitations under the License.
*/ #include "paddle/fluid/operators/fused/fmha_ref.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static void AllReduce(framework::Tensor &tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext &ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void *sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void *recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + template class FusedAttentionOpKernel : public framework::OpKernel { public: @@ -56,6 +84,8 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *src_mask = ctx.Input("SrcMask"); auto *transpose_out_2 = ctx.Output("TransposeOut2"); + auto *cache_kv = ctx.Input("CacheKV"); + auto *cache_kv_out = ctx.Output("CacheKVOut"); auto *qk_out = ctx.Output("QKOut"); auto *qktv_out = ctx.Output("QKTVOut"); auto *softmax_out = ctx.Output("SoftmaxOut"); @@ -86,6 +116,7 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); + int ring_id = ctx.Attr("ring_id"); // final output. auto *out = ctx.Output("Y"); @@ -105,6 +136,10 @@ class FusedAttentionOpKernel : public framework::OpKernel { // get data ptr for FMHA. auto *transpose_out_2_data = transpose_out_2->mutable_data(ctx.GetPlace()); + auto *cache_kv_out_data = + (cache_kv_out == nullptr) + ? nullptr + : cache_kv_out->mutable_data(ctx.GetPlace()); auto *qk_out_data = qk_out->mutable_data(ctx.GetPlace()); auto *qktv_out_data = qktv_out->mutable_data(ctx.GetPlace()); auto *src_mask_out_data = @@ -161,9 +196,14 @@ class FusedAttentionOpKernel : public framework::OpKernel { output_size = hidden_size; // (transA, transB, compute_bias) = (false, false, false) + // NOTE(Yuang Liu): For general input size == output size, change the + // position won't have effects. For mp, the output size is mp_head * dkey + // which is actually the input size. While the input size is hidden size, + // which is actually the output size. So for out linear, switch the + // input size and output size. 
auto out_linear_compute = AttnMatMul(ctx.cuda_device_context(), false, false, bsz_seq, - output_size, input_size, false); + input_size, output_size, false); DropoutParam dropout_param2(ctx, 0); FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, @@ -186,15 +226,15 @@ class FusedAttentionOpKernel : public framework::OpKernel { qkv_bias_out); } if (qkv_bias == nullptr) { - fmha_ref_compute.ComputeForward(*qkv_out, src_mask, transpose_out_2, - qk_out, src_mask_out, softmax_out, - attn_dropout_mask_out, attn_dropout_out, - qktv_out, fmha_out); + fmha_ref_compute.ComputeForward( + *qkv_out, cache_kv, src_mask, transpose_out_2, cache_kv_out, qk_out, + src_mask_out, softmax_out, attn_dropout_mask_out, attn_dropout_out, + qktv_out, fmha_out); } else { - fmha_ref_compute.ComputeForward(*qkv_bias_out, src_mask, transpose_out_2, - qk_out, src_mask_out, softmax_out, - attn_dropout_mask_out, attn_dropout_out, - qktv_out, fmha_out); + fmha_ref_compute.ComputeForward( + *qkv_bias_out, cache_kv, src_mask, transpose_out_2, cache_kv_out, + qk_out, src_mask_out, softmax_out, attn_dropout_mask_out, + attn_dropout_out, qktv_out, fmha_out); } // fmha_out: [batch_size, seq_len, num_head, head_dim] @@ -202,6 +242,9 @@ class FusedAttentionOpKernel : public framework::OpKernel { // out_linear_out: [batch_size, seq_len, embed_dim] out_linear_compute.ComputeForward(out_linear_weight, fmha_out, nullptr, out_linear_out, nullptr); + // tensor model parallel + AllReduce(*out_linear_out, ring_id, ctx.cuda_device_context()); + if (pre_layer_norm) { // output = (residual + dropout(input + bias)) fused_dropout_layernorm_helper.ResidualDropoutBias( @@ -244,6 +287,7 @@ class FusedAttentionGradKernel : public framework::OpKernel { auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); int seed_val_1 = ctx.Attr("attn_dropout_seed"); + int ring_id = ctx.Attr("ring_id"); // get inputs. 
auto *d_y = ctx.Input(framework::GradVarName("Y")); @@ -399,9 +443,10 @@ class FusedAttentionGradKernel : public framework::OpKernel { transA = false; transB = false; bool compute_bias = false; + // (b*s, num_head * dim_head) * (num_head * dim_head, dim_embed) auto out_linear_compute = AttnMatMul(ctx.cuda_device_context(), transA, transB, bsz_seq, - output_size, input_size, compute_bias); + input_size, output_size, compute_bias); DropoutParam dropout_param2(ctx, 0); FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, @@ -475,6 +520,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { qkv_compute.ComputeBackward(ln_out, qkv_weight, d_qkv_out, d_ln_out, d_qkv_weight, d_qkv_bias); } + // tensor model parallel + AllReduce(*d_ln_out, ring_id, ctx.cuda_device_context()); layer_norm_compute.ComputeBackward(x_data, d_ln_out_data, ln_scale_data, ln_mean_data, ln_var_data, d_x_data, d_ln_scale_data, d_ln_bias_data); @@ -486,6 +533,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { qkv_compute.ComputeBackward(input_x, qkv_weight, d_qkv_out, d_x, d_qkv_weight, d_qkv_bias); } + // tensor model parallel + AllReduce(*d_x, ring_id, ctx.cuda_device_context()); } // gradient accumulation std::vector ins; diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu index 2381b5b7fdfb85cbaa3fd66a10c5b630bb515f15..717c1732b7b3acf8528887aae43471c0dc0716e3 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias_test.cu @@ -20,8 +20,14 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" #include "paddle/fluid/operators/fused/fused_dropout_test.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/functors.h" +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif + namespace framework = paddle::framework; namespace platform = paddle::platform; namespace details = paddle::operators::details; diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index d7952df470d81566c3833e79e8cfa31a7d2dc68c..18c7187fc8e64c9fed8a86a984954b5420c1e5b5 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -31,7 +31,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace memory = paddle::memory; -USE_OP(dropout); +USE_OP_ITSELF(dropout); USE_OP(layer_norm); template diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index 0c8eae4260441f6c873b48735a01b967b70ef4bb..f3f8f1742757783a082437638f67407700963eb1 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -195,6 +195,8 @@ class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(false); AddAttr("dropout1_seed", "Dropout1 random seed.").SetDefault(0); AddAttr("dropout2_seed", "Dropout2 random seed.").SetDefault(0); + AddAttr("ring_id", "ring id for tensor model parallel.") + .SetDefault(-1); AddComment(R"DOC( the function of fused_feedforward operator is the same as 
the following pseudo code: residual = src; diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 3131269955bdd17a0552836121589d8edeb4a38e..c38d9f7d4bcbd25b3111b35a918de0f4ebdabefb 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -21,11 +21,39 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + namespace paddle { namespace operators { using Tensor = framework::Tensor; +template +static void AllReduce(framework::Tensor& tensor, // NOLINT + const int ring_id, + const platform::CUDADeviceContext& ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto dtype = + platform::ToNCCLDataType(framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void* sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void* recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should be compiled with NCCL or RCCL when using tensor " + "model parallel op.")); +#endif +} + template class FusedFeedForwardKernel : public framework::OpKernel { public: @@ -56,7 +84,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor* dropout1_out, framework::Tensor* dropout2_out, const int bsz_seq, const int d_model, const int dim_feedforward, const std::string& act_method, const bool pre_layer_norm, - const float epsilon1, const float epsilon2, + const float epsilon1, const float epsilon2, const int ring_id, const DropoutParam& dropout_param1, const DropoutParam& dropout_param2, const platform::CUDADeviceContext& ctx) const { @@ -95,6 +123,10 @@ class FusedFeedForwardKernel : public framework::OpKernel { framework::Tensor linear2_out; linear2_out.mutable_data({bsz_seq, d_model}, place); MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out); + + // tensor model parallel + AllReduce(linear2_out, ring_id, ctx); + if (!pre_layer_norm) { fused_dropout_layernorm_helper.LayernormResidualDropoutBias( ctx, linear2_out.data(), x.data(), linear2_bias_ptr, @@ -150,6 +182,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -186,7 +219,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { dropout2_mask, ln1_mean, ln1_variance, ln2_mean, ln2_variance, linear1_out, ln1_out, dropout1_out, dropout2_out, bsz_seq, d_model, dim_feedforward, act_method, pre_layer_norm, epsilon1, epsilon2, - dropout_param1, dropout_param2, context.cuda_device_context()); + ring_id, dropout_param1, dropout_param2, context.cuda_device_context()); } }; @@ -231,7 +264,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const int dim_feedforward, const DropoutParam&
dropout_param1, const DropoutParam& dropout_param2, const std::string& act_method, const bool pre_layer_norm, const float epsilon1, const float epsilon2, - const platform::CUDADeviceContext& ctx) const { + const int ring_id, const platform::CUDADeviceContext& ctx) const { FusedDropoutLayerNormHelper pre_layernorm_helper( bsz_seq, d_model, epsilon1); FusedDropoutHelper fused_act_dropout_helper( @@ -295,13 +328,16 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { d_ln1_out.mutable_data({bsz_seq, d_model}, place); MatMulGrad(ctx, d_linear1_out, *ln1_out, linear1_weight, &d_ln1_out, d_linear1_weight); - + // tensor model parallel + AllReduce(d_ln1_out, ring_id, ctx); pre_layernorm_helper.LayerNormGrad( ctx, d_ln1_out.data(), x.data(), ln1_gamma_ptr, ln1_mean->data(), ln1_variance->data(), d_x->data(), d_ln1_gamma_ptr, d_ln1_beta_ptr); } else { MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight); + // tensor model parallel + AllReduce(*d_x, ring_id, ctx); } std::vector ins(2); std::vector outs(1); @@ -376,6 +412,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const float epsilon1 = context.Attr("ln1_epsilon"); const float epsilon2 = context.Attr("ln2_epsilon"); + const int ring_id = context.Attr("ring_id"); const std::string act_method = context.Attr("act_method"); DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); @@ -419,7 +456,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { d_linear1_bias, d_linear2_weight, d_linear2_bias, d_ln1_scale, d_ln1_bias, d_ln2_scale, d_ln2_bias, bsz_seq, d_model, dim_feedforward, dropout_param1, dropout_param2, act_method, - pre_layer_norm, epsilon1, epsilon2, context.cuda_device_context()); + pre_layer_norm, epsilon1, epsilon2, ring_id, + context.cuda_device_context()); } }; } // namespace operators diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu index cc14d0680d381ff2bbe73ee712e218c9c4d79185..032440d7f0478dc087e3ba38274f2a31a9a66a23 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias_test.cu @@ -19,6 +19,12 @@ limitations under the License. */ #include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif /** * @brief The unit test of fused_layernorm_residual_dropout_bias diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu index 1a12e6b565f02035b3fb9673636c2344823f288e..5dff5e2225f4f3bf3a20daa02b2b4194bd8cb99e 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias_test.cu @@ -19,6 +19,12 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fused/fused_dropout_test.h" #include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" +#include "paddle/phi/core/kernel_registry.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_DECLARE_KERNEL(dropout, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dropout_grad, GPU, ALL_LAYOUT); +#endif namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 6b559885c569d001233525c3d964fff2175950e3..66eecc13d04d1aa7d4532b69f7a2fbe8c62b8e6f 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -15,12 +15,14 @@ limitations under the License. */ #include #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/fill_constant_op.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/infermeta/nullary.h" namespace paddle { namespace operators { @@ -54,38 +56,6 @@ class GaussianRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GaussianRandom"); - - auto shape = ctx->Attrs().Get>("shape"); - std::vector temp; - temp.reserve(shape.size()); - for (auto dim : shape) { - temp.push_back(static_cast(dim)); - } - if (shape.empty() && ctx->HasInput("ShapeTensor")) { - auto shape_dims = ctx->GetInputDim("ShapeTensor"); - int num_ele = 1; - for (int i = 0; i < shape_dims.size(); ++i) { - num_ele *= shape_dims[i]; - } - auto vec_dims = std::vector(num_ele, -1); - ctx->SetOutputDim("Out", phi::make_ddim(vec_dims)); - - return; - } - if (!ctx->HasInput("ShapeTensor") && !ctx->HasInputs("ShapeTensorList")) { - PADDLE_ENFORCE_GT( - shape.size(), 0UL, - platform::errors::InvalidArgument( - "Attribute(shape) of GaussianRandomOp must be set " - "and shape.size() > 0, but reveived shape.size() is %d", - shape.size())); - } - - ctx->SetOutputDim("Out", phi::make_ddim(temp)); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -171,11 +141,20 @@ Used to initialize tensors with gaussian random generator. 
} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp, - ops::GaussianRandomOpMaker); + +DECLARE_INFER_SHAPE_FUNCTOR(gaussian_random, GaussianRandomInferShapeFunctor, + PD_INFER_META(phi::GaussianRandomInferMeta)); + +REGISTER_OPERATOR( + gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + GaussianRandomInferShapeFunctor); + REGISTER_OP_CPU_KERNEL(gaussian_random_batch_size_like, ops::CPUGaussianRandomBatchSizeLikeKernel, ops::CPUGaussianRandomBatchSizeLikeKernel); + REGISTER_OP_VERSION(gaussian_random) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index 00ff7ad2166dcf99d7b60ec45adfe70b478dedf8..f3ac53138328dbfad12c6d530a6517f40c658677 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index 6af8388d9eba4e4ea8fbb833f84a5c06e182b1f2..f7c006dbcb1a9a23ec619c8d790df8a093530eee 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/graph_send_recv_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -21,59 +24,6 @@ class GraphSendRecvOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasInput("Src_index"), "Input", "Src_index", - "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasInput("Dst_index"), "Input", "Dst_index", - "GraphSendRecv"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GraphSendRecv"); - - auto src_index_dims = ctx->GetInputDim("Src_index"); - if (src_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(src_index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of Src_index should be 1 when it " - "is 2D, but we get %d", - src_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - src_index_dims.size(), 1, - platform::errors::InvalidArgument( - "The Src_index should be 1D, when it is not 2D, but we get %d", - src_index_dims.size())); - } - - auto dst_index_dims = ctx->GetInputDim("Dst_index"); - if (dst_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(dst_index_dims[1], 1, - platform::errors::InvalidArgument( - "The last dim of Dst_index should be 1 when it " - "is 2D, but we get %d", - dst_index_dims[1])); - } else { - PADDLE_ENFORCE_EQ( - dst_index_dims.size(), 1, - platform::errors::InvalidArgument("The Dst_index should be 1D, " - "when it is not 2D, but we get %d", - dst_index_dims.size())); - } - - PADDLE_ENFORCE_EQ( - src_index_dims[0], dst_index_dims[0], - platform::errors::InvalidArgument( - "Src_index and Dst_index should have the same shape.")); - - auto dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pool_type") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("Dst_count"), "Output", "Dst_count", - "GraphSendRecv"); - ctx->SetOutputDim("Dst_count", {dims[0]}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -164,20 +114,12 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; +DECLARE_INFER_SHAPE_FUNCTOR(graph_send_recv, GraphSendRecvInferShapeFunctor, + PD_INFER_META(phi::GraphSendRecvInferMeta)); REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP, ops::GraphSendRecvOpMaker, ops::GraphSendRecvGradOpMaker, - ops::GraphSendRecvGradOpMaker); + ops::GraphSendRecvGradOpMaker, + GraphSendRecvInferShapeFunctor); REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp); -REGISTER_OP_CPU_KERNEL(graph_send_recv, ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel, - ops::GraphSendRecvOpKernel); - -REGISTER_OP_CPU_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel, - ops::GraphSendRecvGradOpKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu deleted file mode 100644 index f43d31814ac38430d2d473eeca548b63e1a5c1fa..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.cu +++ /dev/null @@ -1,419 +0,0 
@@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/graph_send_recv_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMaxCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); - } -}; - -template -struct GraphSendRecvMinCUDAFunctor { - DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, - const IndexT& out_i) { - paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); - } -}; - -template -__global__ void GraphSendRecvCUDAKernel(const T* params, - const IndexT* src_indices, - const IndexT* dst_indices, T* output, - size_t index_size, size_t slice_size, - Functor functor) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - functor(params, output, in_i, out_i); - } -} - -// For max -template -__global__ void InputResetMaxCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::min()) { - *(output + i) = 0; - } - } -} - -// For min -template -__global__ void InputResetMinCUDAKernel(T* output, size_t input_size, - size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::max()) { - *(output + i) = 0; - } - } -} - -// Get dst_count -template -__global__ void ComputeCountCUDAKernel(int* count, const IndexT* dst_indices, - size_t index_size) { - CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { - IndexT dst_i = dst_indices[i]; - paddle::platform::CudaAtomicAdd(count + dst_i, 1); - } -} - -// For forward mean -template -__global__ void ManipulateMeanCUDAKernel(T* output, int* count, - size_t input_size, size_t slice_size) { - CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - int64_t c_index = i / slice_size; - if (*(count + c_index) > 1) { - *(output + i) = *(output + i) / *(count + c_index); - } - } -} - -// For backward mean -template -__global__ void ManipulateMeanGradCUDAKernel( - const T* params, const IndexT* src_indices, const 
IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const int* dst_count) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd(output + out_i, - *(params + in_i) / dst_count[src_i]); - } -} - -// For backward min and max -template -__global__ void ManipulateMinMaxGradCUDAKernel( - const T* params, const IndexT* src_indices, const IndexT* dst_indices, - T* output, size_t index_size, size_t slice_size, const T* ptr_input, - const T* ptr_output) { - CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { - int64_t indices_i = i / slice_size; - int64_t slice_i = i - indices_i * slice_size; - IndexT src_i = src_indices[indices_i]; - IndexT dst_i = dst_indices[indices_i]; - int64_t in_i = src_i * slice_size + slice_i; - int64_t out_i = dst_i * slice_size + slice_i; - paddle::platform::CudaAtomicAdd( - output + out_i, - *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); - } -} - -template -void GraphSendRecvOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input("X"); - auto* Y = ctx.Output("Out"); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); - } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::max()); - } - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MAX") { - GraphSendRecvMaxCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_max = - grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx; - InputResetMaxCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MIN") { - GraphSendRecvMinCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_min = - grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx; - InputResetMinCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MEAN") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_dst_count, 0, input_size * sizeof(int)); -#else - cudaMemset(p_dst_count, 0, input_size * sizeof(int)); -#endif - - int64_t grid_count = (index_size + block - 1) / block; - ComputeCountCUDAKernel< - T, IndexT><<( - ctx.device_context()) - .stream()>>>(p_dst_count, d_index, index_size); - - int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block; - int64_t grid_mean = - grid_mean_tmp < max_grid_dimx ? 
grid_mean_tmp : max_grid_dimx; - ManipulateMeanCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, p_dst_count, input_size, slice_size); - } -} - -template -void GraphSendRecvGradOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index, - const Tensor& dst_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* Y = ctx.Output(framework::GradVarName("X")); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - - if (index_size == 0) return; - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index.data(); - -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - const auto& dev_ctx = ctx.cuda_device_context(); - int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - GraphSendRecvSumCUDAFunctor functor; - GraphSendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - ManipulateMeanGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, s_count); - } else if (pool_type == "MAX" || pool_type == "MIN") { - auto* input = ctx.Input("X"); - auto* output = ctx.Input("Out"); - const T* ptr_input = input->data(); - const T* ptr_output = output->data(); - ManipulateMinMaxGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, ptr_input, - ptr_output); - } -} - -template -class GraphSendRecvOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto* dst_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto* dst_index = ctx.Input("Src_index"); - auto index_type = 
framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index, *dst_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index dtype, expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle - -using CUDA = paddle::platform::CUDADeviceContext; -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL(graph_send_recv, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel, - ops::GraphSendRecvOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(graph_send_recv_grad, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel, - ops::GraphSendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/graph_send_recv_op.h b/paddle/fluid/operators/graph_send_recv_op.h deleted file mode 100644 index 8d8111e0ee845bf6828ee53459e6d86bdebba484..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/graph_send_recv_op.h +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct GraphSendRecvSumFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - eigen_dst += eigen_src; - } -}; - -template -struct GraphSendRecvMinFunctor { - void operator()(const bool& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMin(eigen_src); - } - } -}; - -template -struct GraphSendRecvMaxFunctor { - void operator()(const int& first_flag, const Tensor& src_slice, - Tensor* dst_slice) { - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag) { - eigen_dst += eigen_src; - } else { - eigen_dst = eigen_dst.cwiseMax(eigen_src); - } - } -}; - -template -void elementwise_inner_operation(const Tensor& src, Tensor* dst, - const IndexT& src_index, - const IndexT& dst_index, - const bool& first_flag, Functor functor) { - auto src_slice = src.Slice(src_index, src_index + 1); - auto dst_slice = dst->Slice(dst_index, dst_index + 1); - - functor(first_flag, src_slice, &dst_slice); -} - -template -void graph_send_recv_cpu_for_loop(const int& input_size, const int& index_size, - const IndexT* s_index, const IndexT* d_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - int* dst_count = nullptr) { - Functor functor; - if (pool_type == "SUM") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - for (int i = 0; i < index_size; ++i) { - IndexT dst_idx = d_index[i]; - *(dst_count + dst_idx) += 1; - } - for (int i = 0; i < input_size; ++i) { - if (*(dst_count + i) == 0) continue; - auto dst_slice = dst->Slice(i, i + 1); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(*(dst_count + i)); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - std::set existed_dst; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); - if (!in_set) { - elementwise_inner_operation(src, dst, src_idx, - dst_idx, true, functor); - existed_dst.emplace(dst_idx); - } else { - elementwise_inner_operation( - src, dst, src_idx, dst_idx, false, functor); - } - } - } -} - -template -void graph_send_recv_cpu_for_loop_grad( - const int& input_size, const int& index_size, const IndexT* s_index, - const IndexT* d_index, const Tensor& src, Tensor* dst, - const std::string& pool_type, const int* dst_count = nullptr, - const Tensor* input = nullptr, const Tensor* output = nullptr) { - if 
(pool_type == "SUM") { - Functor functor; - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - elementwise_inner_operation(src, dst, src_idx, - dst_idx, false, functor); - } - } else if (pool_type == "MEAN") { - for (int i = 0; i < index_size; ++i) { - const IndexT& src_idx = s_index[i]; - const IndexT& dst_idx = d_index[i]; - auto src_slice = src.Slice(src_idx, src_idx + 1); - auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += (eigen_src / static_cast(dst_count[src_idx])); - } - } else if (pool_type == "MIN" || pool_type == "MAX") { - for (int i = 0; i < index_size; ++i) { - const IndexT& forward_src_idx = d_index[i]; - const IndexT& forward_dst_idx = s_index[i]; - auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); - auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); - auto eigen_input = framework::EigenVector::Flatten(input_slice); - auto eigen_output = framework::EigenVector::Flatten(output_slice); - - auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1); - auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1); - auto eigen_src = framework::EigenVector::Flatten(src_slice); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += eigen_src * (eigen_output == eigen_input); - } - } -} - -template -void GraphSendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { - auto* X = ctx.Input("X"); - auto* dst_index = ctx.Input("Dst_index"); - auto* Y = ctx.Output("Out"); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if (index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MIN") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MAX") { - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - memset(p_dst_count, 0, src_dims[0] * sizeof(int)); - graph_send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, - p_dst_count); - } -} - -template -void GraphSendRecvGradOpKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index) { - auto* X = ctx.Input(framework::GradVarName("Out")); - auto* dst_index = ctx.Input("Src_index"); - auto* Y = ctx.Output(framework::GradVarName("X")); - - const int& index_size = src_index.dims()[0]; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - if 
(index_size == 0) return; - - const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); - - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); - } else if (pool_type == "MIN" || pool_type == "MAX") { - const auto* input = ctx.Input("X"); - const auto* output = ctx.Input("Out"); - // Functor not used here. - graph_send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, - input, output); - } -} - -template -class GraphSendRecvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Src_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvOpKernelLaunchHelper(ctx, *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvOpKernelLaunchHelper(ctx, - *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -template -class GraphSendRecvGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* src_index = ctx.Input("Dst_index"); - auto index_type = framework::TransToProtoVarType(src_index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - GraphSendRecvGradOpKernelLaunchHelper(ctx, - *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - GraphSendRecvGradOpKernelLaunchHelper( - ctx, *src_index); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " - "got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index 09f4e63943ad3784a598524273831bf875ed9213..8324a6215bca8145ba36dabb3d8108006a57e829 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/inverse_op.h b/paddle/fluid/operators/inverse_op.h index 1e061d8b50ae02f9b87f0a0976543467aa0b7dd0..31c22915ec5d052eb11c613d476f6aea541d8c47 100644 --- a/paddle/fluid/operators/inverse_op.h +++ b/paddle/fluid/operators/inverse_op.h @@ -15,8 +15,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/matrix_inverse.h" namespace paddle { namespace operators { @@ -30,7 +30,7 @@ class InverseKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - math::MatrixInverseFunctor mat_inv; + phi::funcs::MatrixInverseFunctor mat_inv; mat_inv(dev_ctx, *input, output); } }; diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 735fffa7203b1213fccec0c4098048e85a6b24f8..cfa370ff9cb19dfb7d488b03cba52c115083cdc8 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/isfinite_v2_op.h" - #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -49,11 +51,6 @@ class OverflowV2Op : public framework::OperatorWithKernel { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "isfinitev2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "isfinitev2"); - UnaryOpUnchangedInferShape(ctx); - } protected: framework::OpKernelType GetExpectedKernelType( @@ -104,6 +101,14 @@ element of X as a tensor. 
} // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(isinf_v2, IsinfInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isnan_v2, IsnanInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); + +DECLARE_INFER_SHAPE_FUNCTOR(isfinite_v2, IsfiniteInferShapeFunctor, + PD_INFER_META(phi::IsfiniteInferMeta)); #define REGISTER_V2OP_MAKER(op_type, comment) \ namespace paddle { \ @@ -124,50 +129,17 @@ REGISTER_V2OP_MAKER(isfinite_v2, "isfinitev2(X)"); REGISTER_OPERATOR( isinf_v2, ops::OverflowV2Op, ops::_isinf_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsinfInferShapeFunctor); REGISTER_OPERATOR( isnan_v2, ops::OverflowV2Op, ops::_isnan_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + IsnanInferShapeFunctor); REGISTER_OPERATOR( isfinite_v2, ops::OverflowV2Op, ops::_isfinite_v2OverflowV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CPU_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); + paddle::framework::EmptyGradOpMaker, + IsfiniteInferShapeFunctor); diff --git a/paddle/fluid/operators/isfinite_v2_op.cu b/paddle/fluid/operators/isfinite_v2_op.cu deleted file mode 100644 index 1b9f19d36dfa0f590f96577295ffb12e4456d2e5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/isfinite_v2_op.cu +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/isfinite_v2_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(isnan_v2, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isinf_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); - -REGISTER_OP_CUDA_KERNEL( - isfinite_v2, ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel, - ops::OverflowKernel); diff --git a/paddle/fluid/operators/kthvalue_op.cu b/paddle/fluid/operators/kthvalue_op.cu index 4f30c58d375008abb3509989f90bcd9fec91fb38..f6f56f70f1a11971b31e679ef879f2d1d0a96085 100644 --- a/paddle/fluid/operators/kthvalue_op.cu +++ b/paddle/fluid/operators/kthvalue_op.cu @@ -16,7 +16,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/kthvalue_op.h" #include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" #ifdef __NVCC__ #include "cub/cub.cuh" #endif diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc index fe271fa5e893a750bdbbdc05ac4b7835205ebe66..378c7573d6129abc28bd53dd6f964e5c726cce34 100644 --- a/paddle/fluid/operators/linspace_op.cc +++ b/paddle/fluid/operators/linspace_op.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/linspace_op.h" #include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,33 +27,6 @@ class LinspaceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Start"), "Input", "Start", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Stop"), "Input", "Stop", "linspace"); - OP_INOUT_CHECK(ctx->HasInput("Num"), "Input", "Num", "linspace"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "linspace"); - - auto s_dims = ctx->GetInputDim("Start"); - PADDLE_ENFORCE_EQ((s_dims.size() == 1) && (s_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Start) must be [1]," - "but received input shape is [%s].", - s_dims)); - auto e_dims = ctx->GetInputDim("Stop"); - PADDLE_ENFORCE_EQ((e_dims.size() == 1) && (e_dims[0] == 1), true, - platform::errors::InvalidArgument( - "The shape of Input(Stop) must be [1]," - "but received input shape is [%s].", - e_dims)); - auto step_dims = ctx->GetInputDim("Num"); - PADDLE_ENFORCE_EQ( - (step_dims.size() == 1) && (step_dims[0] == 1), true, - platform::errors::InvalidArgument("The shape of Input(Num) must be [1]," - "but received input shape is [%s].", - step_dims)); - ctx->SetOutputDim("Out", {-1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -88,11 +65,13 @@ class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { } // namespace paddle namespace ops = paddle::operators; 
-REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker); -REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel, - ops::CPULinspaceKernel); +DECLARE_INFER_SHAPE_FUNCTOR(linspace, LinspaceInferShapeFunctor, + PD_INFER_META(phi::LinspaceInferMeta)); +REGISTER_OPERATOR( + linspace, ops::LinspaceOp, ops::LinspaceOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + LinspaceInferShapeFunctor); REGISTER_OP_VERSION(linspace) .AddCheckpoint( diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu deleted file mode 100644 index aa625a7f5b9df0aa76872c56a3769f1186125bf5..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/linspace_op.cu +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/linspace_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void LinspaceKernel(T start, T stop, double step, int64_t size, - T* out) { - int64_t index = blockIdx.x * blockDim.x + threadIdx.x; - - for (; index < size; index += blockDim.x * gridDim.x) { - if (index < size / 2) { - out[index] = static_cast(start + step * index); - } else { - out[index] = static_cast(stop - step * (size - index - 1)); - } - } -} - -template -__global__ void LinspaceSpecialKernel(T start, T* out) { - out[0] = static_cast(start); -} - -template -class CUDALinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - auto* num_t = context.Input("Num"); - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - framework::Tensor n_start; - framework::Tensor n_stop; - framework::Tensor n_num; - framework::TensorCopy(start_t, platform::CPUPlace(), &n_start); - T start = n_start.data()[0]; - framework::TensorCopy(stop_t, platform::CPUPlace(), &n_stop); - T stop = n_stop.data()[0]; - framework::TensorCopy(*num_t, platform::CPUPlace(), &n_num); - int64_t num = static_cast(n_num.data()[0]); - - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op 
should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - T* out_data = out->mutable_data(context.GetPlace()); - - double step = 0; - auto stream = context.cuda_device_context().stream(); - int block = 512; - int grid = (num + block - 1) / block; - if (num != 1) { - step = (static_cast(stop - start)) / (num - 1); - LinspaceKernel<<>>(start, stop, step, num, - out_data); - } else { - LinspaceSpecialKernel<<>>(start, out_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel, - ops::CUDALinspaceKernel); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h deleted file mode 100644 index ae51f1221cc09b433e784ecaf52da69e41fc3706..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/linspace_op.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class CPULinspaceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* pre_start = context.Input("Start"); - auto* pre_stop = context.Input("Stop"); - int32_t num = context.Input("Num")->data()[0]; - auto* out = context.Output("Out"); - auto dtype = static_cast( - context.Attr("dtype")); - - Tensor start_t; - Tensor stop_t; - auto start_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_start->dtype()), context.GetPlace()); - auto stop_dtype = framework::OpKernelType( - framework::TransToProtoVarType(pre_stop->dtype()), context.GetPlace()); - auto out_dtype = framework::OpKernelType(dtype, context.GetPlace()); - framework::TransDataType(start_dtype, out_dtype, *pre_start, &start_t); - framework::TransDataType(stop_dtype, out_dtype, *pre_stop, &stop_t); - - T start = start_t.data()[0]; - T stop = stop_t.data()[0]; - PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( - "The num of linspace op should be larger " - "than 0, but received num is %d", - num)); - - out->Resize(phi::make_ddim({num})); - - T* out_data = out->mutable_data(context.GetPlace()); - - if (num > 1) { - // step should be of double type for all types - double step = (static_cast(stop - start)) / (num - 1); - int half_num = num / 2; - for (int i = 0; i < num; ++i) { - if (i < half_num) { - out_data[i] = static_cast(start + step * i); - } else { - out_data[i] = static_cast(stop - step * (num - i - 1)); - } - } - } else { - out_data[0] = static_cast(start); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/log_loss_op.cc 
b/paddle/fluid/operators/log_loss_op.cc index 2e596ff3e625735e5ff644560c9866f4f15a044a..883e3597d8a31138a6ff1e4cfcb05a165eafc4a6 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -21,43 +24,6 @@ namespace operators { class LogLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predicted"), "Input", "Predicted", "LogLoss"); - OP_INOUT_CHECK(ctx->HasInput("Labels"), "Input", "Labels", "LogLoss"); - - auto pred_dims = ctx->GetInputDim("Predicted"); - auto label_dims = ctx->GetInputDim("Labels"); - - if (ctx->IsRuntime() || - (phi::product(pred_dims) > 0 && phi::product(label_dims) > 0)) { - PADDLE_ENFORCE_EQ( - pred_dims, label_dims, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be equal to the" - "dimensions of Input(Labels), but received dimensions of " - "Input(Predicted)" - "is [%s], received dimensions of Input(Labels) is [%s].", - pred_dims, label_dims)); - } - PADDLE_ENFORCE_EQ(pred_dims.size(), 2, - platform::errors::InvalidArgument( - "The dimensions of Input(Predicted) must be 2," - "But received dimensions of Input(Predicted)" - "is [%d]", - pred_dims.size())); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - pred_dims[1], 1, - platform::errors::InvalidArgument( - "Each row of Input(Predicted) contains a real value, " - "so the 2nd dimension of Input(X) must be 1," - "But got [%d]", - pred_dims[1])); - } - ctx->SetOutputDim("Loss", {pred_dims[0], 1}); - ctx->ShareLoD("Predicted", "Loss"); - } }; template @@ -145,7 +111,10 @@ class LogLossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(log_loss, LogLossInferShapeFunctor, + PD_INFER_META(phi::LogLossInferMeta)); REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker, ops::LogLossGradMaker, - ops::LogLossGradMaker); + ops::LogLossGradMaker, + LogLossInferShapeFunctor); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index bce927c32ddf7e9c78f7c2ba1be50e6929426d4f..31a98d9f630e1c01f3b886cbe91dd3882b384d05 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -46,8 +46,6 @@ math_library(vol2col) math_library(prelu) math_library(bert_encoder_functor) math_library(tree2col DEPS math_function) -math_library(matrix_inverse) -math_library(segment_pooling) math_library(matrix_solve) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) diff --git a/paddle/fluid/operators/math/matrix_inverse.cc b/paddle/fluid/operators/math/matrix_inverse.cc deleted file mode 100644 index 1b36e615c68df814015a2c308ed74b755f6bc635..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cc +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "Eigen/Core" -#include "Eigen/LU" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { - compute_inverse_eigen(context, a, a_inv); - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc deleted file mode 100644 index 41335a69417a94a567119bb8f37378af957be541..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace platform { -class CUDADeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace math { - -template -class MatrixInverseFunctor; - -template -class MatrixInverseFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& a, framework::Tensor* a_inv) { -#ifndef PADDLE_WITH_HIP - const auto& mat_dims = a.dims(); - const int rank = mat_dims.size(); - int n = mat_dims[rank - 1]; - int batch_size = rank > 2 ? a.numel() / (n * n) : 1; - - memory::allocation::AllocationPtr tmp_gpu_mat_data; - const T* gpu_mat = a.data(); - if (n >= 32) { - // Copy all elements of input matrix A to a temporary memory space to - // avoid being overriden by getrf. - tmp_gpu_mat_data = memory::Alloc(context, a.numel() * sizeof(T)); - memory::Copy(context.GetPlace(), tmp_gpu_mat_data->ptr(), - context.GetPlace(), a.data(), a.numel() * sizeof(T), - context.stream()); - gpu_mat = reinterpret_cast(tmp_gpu_mat_data->ptr()); - } - - std::vector cpu_ptrs(batch_size * 2); - for (int i = 0; i < batch_size; ++i) { - cpu_ptrs[i] = gpu_mat + i * n * n; - cpu_ptrs[i + batch_size] = a_inv->data() + i * n * n; - } - - // Copy the addresses of A and A_inv from host to device. 
- memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); - memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), - platform::CPUPlace(), static_cast(cpu_ptrs.data()), - cpu_ptrs.size() * sizeof(T*), context.stream()); - T** gpu_inv_ptrs = - reinterpret_cast(tmp_gpu_ptrs_data->ptr()) + batch_size; - - // Allocate device memory for info and pivots. - int num_ints = n < 32 ? batch_size : batch_size * (n + 1); - memory::allocation::AllocationPtr tmp_gpu_info_data = - memory::Alloc(context, num_ints * sizeof(int)); - int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); - - auto blas = phi::funcs::GetBlas(context); - - std::vector info; // only for singular checking - info.resize(batch_size); - // This functions in cuBLAS is intended to be used for matrices of small - // sizes where the launch overhead is a significant factor. - // TODO(Xreki): call function in cusolver for large matrices. - if (n < 32) { - // cublasmatinvBatched is a short cut of cublasgetrfBatched - // plus cublasgetriBatched. - // However it only works if N is less than 32. If not, we need to - // go through cublasgetrfBatched and cublasgetriBatched. - blas.BatchedMatInv(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_inv_ptrs, gpu_info_ptr, batch_size); - } else { - // This function performs the LU factorization of each matrix A by the - // equation P * A = L * U. L and U are written back to original matrix A, - // and diagonal elements of L are discarded. - int* gpu_pivot_ptr = - reinterpret_cast(tmp_gpu_info_data->ptr()) + batch_size; - blas.BatchedGETRF(n, reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_info_ptr, batch_size); - - blas.BatchedGETRI(n, - reinterpret_cast(tmp_gpu_ptrs_data->ptr()), - gpu_pivot_ptr, gpu_inv_ptrs, gpu_info_ptr, batch_size); - } - memory::Copy(platform::CPUPlace(), info.data(), context.GetPlace(), - gpu_info_ptr, sizeof(int) * batch_size, context.stream()); - for (int i = 0; i < batch_size; ++i) { - PADDLE_ENFORCE_EQ(info[i], 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: U(%d, %d) is zero, singular U. " - "Please check the matrix value and change it to a " - "non-singular matrix", - i, info[i], info[i])); - } -#else - compute_inverse_eigen(context, a, a_inv); -#endif - } -}; - -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc index 45556e97d1d7afb81d626c99b078cbc215c0195f..28ec3a871022f4b9ec4dce9d9310fd630f10e473 100644 --- a/paddle/fluid/operators/math/maxouting.cc +++ b/paddle/fluid/operators/math/maxouting.cc @@ -14,106 +14,107 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/phi/backends/cpu/cpu_context.h" + namespace paddle { namespace operators { namespace math { // All tensors are in NCHW or NHWC format, and the groups must be greater than 1 -template -class MaxOutFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); - const int output_channels = output->dims()[axis]; - int fea_size = input_height * input_width; - // c_size means the output size of each sample - int c_size = fea_size * output_channels; - const T* input_data = input.data(); - T* output_data = output->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int new_bindex = c_size * i; - for (int c = 0; c < output_channels; ++c) { - int new_cindex = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - T ele = static_cast(-FLT_MAX); - int input_idx, output_idx; - for (int ph = 0; ph < groups; ++ph) { - if (axis == 1) { - input_idx = - (new_bindex + new_cindex) * groups + ph * fea_size + f; - } else { - input_idx = (new_bindex + f * output_channels + c) * groups + ph; - } - T x = input_data[input_idx]; - ele = ele > x ? ele : x; - } +template +void MaxOutFunctor::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* output, + const int groups, + const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); + const int output_channels = output->dims()[axis]; + int fea_size = input_height * input_width; + // c_size means the output size of each sample + int c_size = fea_size * output_channels; + const T* input_data = input.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int i = 0; i < batch_size; ++i) { + int new_bindex = c_size * i; + for (int c = 0; c < output_channels; ++c) { + int new_cindex = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + T ele = static_cast(-FLT_MAX); + int input_idx, output_idx; + for (int ph = 0; ph < groups; ++ph) { if (axis == 1) { - output_idx = new_bindex + new_cindex + f; + input_idx = (new_bindex + new_cindex) * groups + ph * fea_size + f; } else { - output_idx = new_bindex + f * output_channels + c; + input_idx = (new_bindex + f * output_channels + c) * groups + ph; } - output_data[output_idx] = ele; + T x = input_data[input_idx]; + ele = ele > x ? ele : x; } + if (axis == 1) { + output_idx = new_bindex + new_cindex + f; + } else { + output_idx = new_bindex + f * output_channels + c; + } + output_data[output_idx] = ele; } } } -}; +} -template -class MaxOutGradFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, - const framework::Tensor& output, - const framework::Tensor& output_grad, const int groups, - const int axis) { - const int batch_size = input.dims()[0]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]); - const int output_channels = output.dims()[axis]; - int fea_size = input_height * input_width; - const T* input_data = input.data(); - const T* output_data = output.data(); - const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); +template +void MaxOutGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, + const framework::Tensor& output_grad, const int groups, const int axis) { + const int batch_size = input.dims()[0]; + const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); + const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]); + const int output_channels = output.dims()[axis]; + int fea_size = input_height * input_width; + const T* input_data = input.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - for (int i = 0; i < batch_size; ++i) { - int blen = fea_size * output_channels * i; - for (int c = 0; c < output_channels; ++c) { - int clen = fea_size * c; - for (int f = 0; f < fea_size; ++f) { - int input_idx0, output_idx; - bool continue_match = true; - if (axis == 1) { - input_idx0 = (blen + clen) * groups + f; - output_idx = blen + clen + f; - } else { - input_idx0 = (blen + f * output_channels + c) * groups; - output_idx = blen + f * output_channels + c; - } - for (int g = 0; g < groups && continue_match; ++g) { - int idx_offset = (axis == 1 ? fea_size * g : g); - int input_idx = input_idx0 + idx_offset; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - continue_match = false; - } + for (int i = 0; i < batch_size; ++i) { + int blen = fea_size * output_channels * i; + for (int c = 0; c < output_channels; ++c) { + int clen = fea_size * c; + for (int f = 0; f < fea_size; ++f) { + int input_idx0, output_idx; + bool continue_match = true; + if (axis == 1) { + input_idx0 = (blen + clen) * groups + f; + output_idx = blen + clen + f; + } else { + input_idx0 = (blen + f * output_channels + c) * groups; + output_idx = blen + f * output_channels + c; + } + for (int g = 0; g < groups && continue_match; ++g) { + int idx_offset = (axis == 1 ? fea_size * g : g); + int input_idx = input_idx0 + idx_offset; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; } } } } } -}; +} template class MaxOutGradFunctor; template class MaxOutGradFunctor; template class MaxOutFunctor; template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index 1856fb4eb48c73f96d4f6428ba890c821a61292c..1d0478db5ef4a80d955d1166ffa21ff39f6bd184 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" namespace paddle { namespace operators { @@ -95,61 +96,57 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, /* * All tensors are in NCHW or NHWC format. */ -template -class MaxOutFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - const int groups, const int axis) { - const int batch_size = input.dims()[0]; - const int input_channels = input.dims()[axis]; - const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]); - const int input_width = (axis == 1 ? 
input.dims()[3] : input.dims()[2]);
-    const int output_channels = output->dims()[axis];
-
-    const T* input_data = input.data<T>();
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    int nthreads = output->numel();
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelMaxOut<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, input_channels, input_height, input_width,
-        groups, axis, output_data);
-  }
-};
+template <typename DeviceContext, typename T>
+void MaxOutFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
+                                                 const framework::Tensor& input,
+                                                 framework::Tensor* output,
+                                                 const int groups,
+                                                 const int axis) {
+  const int batch_size = input.dims()[0];
+  const int input_channels = input.dims()[axis];
+  const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]);
+  const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]);
+  const int output_channels = output->dims()[axis];
+
+  const T* input_data = input.data<T>();
+  T* output_data = output->mutable_data<T>(context.GetPlace());
+  int nthreads = output->numel();
+  int blocks = (nthreads + 1024 - 1) / 1024;
+  dim3 threads(1024, 1);
+  dim3 grid(blocks, 1);
+
+  KernelMaxOut<T><<<grid, threads, 0, context.stream()>>>(
+      nthreads, input_data, input_channels, input_height, input_width, groups,
+      axis, output_data);
+}
+
 /*
  * All tensors are in NCHW or NHWC format.
  */
-template <typename T>
-class MaxOutGradFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor* input_grad,
-                  const framework::Tensor& output,
-                  const framework::Tensor& output_grad, const int groups,
-                  const int axis) {
-    const int batch_size = input.dims()[0];
-    const int input_channels = input.dims()[axis];
-    const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]);
-    const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]);
-    const int output_channels = output.dims()[axis];
-
-    const T* input_data = input.data<T>();
-    const T* output_data = output.data<T>();
-    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
-    int nthreads = output.numel();
-    int blocks = (nthreads + 1024 - 1) / 1024;
-    dim3 threads(1024, 1);
-    dim3 grid(blocks, 1);
-
-    KernelMaxoutGrad<T><<<grid, threads, 0, context.stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_height, input_width, groups, axis);
-  }
-};
+template <typename DeviceContext, typename T>
+void MaxOutGradFunctor<DeviceContext, T>::operator()(
+    const DeviceContext& context, const framework::Tensor& input,
+    framework::Tensor* input_grad, const framework::Tensor& output,
+    const framework::Tensor& output_grad, const int groups, const int axis) {
+  const int batch_size = input.dims()[0];
+  const int input_channels = input.dims()[axis];
+  const int input_height = (axis == 1 ? input.dims()[2] : input.dims()[1]);
+  const int input_width = (axis == 1 ? input.dims()[3] : input.dims()[2]);
+  const int output_channels = output.dims()[axis];
+
+  const T* input_data = input.data<T>();
+  const T* output_data = output.data<T>();
+  const T* output_grad_data = output_grad.data<T>();
+  T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+  int nthreads = output.numel();
+  int blocks = (nthreads + 1024 - 1) / 1024;
+  dim3 threads(1024, 1);
+  dim3 grid(blocks, 1);
+
+  KernelMaxoutGrad<T><<<grid, threads, 0, context.stream()>>>(
+      nthreads, input_data, output_data, output_grad_data, input_grad_data,
+      input_channels, input_height, input_width, groups, axis);
+}
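// Note: both functors above reduce over `groups` consecutive input channels
// (NCHW) or an interleaved stride (NHWC). A minimal CPU sketch of the forward
// reduction for NCHW only (`MaxoutNCHW` is a hypothetical helper written for
// illustration, not part of this patch): output channel c takes the
// element-wise max over input channels c*groups .. c*groups + groups - 1.
//
//   #include <algorithm>
//   #include <vector>
//
//   std::vector<float> MaxoutNCHW(const std::vector<float>& in, int n,
//                                 int c_in, int hw, int groups) {
//     const int c_out = c_in / groups;  // assumes c_in % groups == 0
//     std::vector<float> out(static_cast<size_t>(n) * c_out * hw);
//     for (int i = 0; i < n; ++i) {
//       for (int c = 0; c < c_out; ++c) {
//         for (int f = 0; f < hw; ++f) {
//           float best = in[(i * c_in + c * groups) * hw + f];
//           for (int g = 1; g < groups; ++g) {
//             best = std::max(best, in[(i * c_in + c * groups + g) * hw + f]);
//           }
//           out[(i * c_out + c) * hw + f] = best;
//         }
//       }
//     }
//     return out;
//   }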
 template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
 template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
@@ -157,6 +154,12 @@ template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
 template class MaxOutFunctor<platform::CUDADeviceContext, float>;
 template class MaxOutFunctor<platform::CUDADeviceContext, double>;
 
+template class MaxOutGradFunctor<phi::GPUContext, float>;
+template class MaxOutGradFunctor<phi::GPUContext, double>;
+
+template class MaxOutFunctor<phi::GPUContext, float>;
+template class MaxOutFunctor<phi::GPUContext, double>;
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/math/maxouting.h b/paddle/fluid/operators/math/maxouting.h
index 0d8372df8a2fec306f6091712c66d55d1e71216e..1f4964f7715426d2eab6168ae009ffbd40e1ff0a 100644
--- a/paddle/fluid/operators/math/maxouting.h
+++ b/paddle/fluid/operators/math/maxouting.h
@@ -30,7 +30,7 @@ class MaxOutFunctor {
                   const int axis = 1);
 };
 
-template <typename DeviceContext, class T>
+template <typename DeviceContext, typename T>
 class MaxOutGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc
index 1524a50f1ac6d6afa67722bc5d1c16a581395bb2..87df75ac465042a0f7894abecb4be4c213e5d807 100644
--- a/paddle/fluid/operators/matmul_v2_op_xpu.cc
+++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc
@@ -38,7 +38,7 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
   auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(
       ColumnMatrixFromVector(y_dims), 0, trans_y);
 
-  if (x_dims.size() == 3 && y_dims.size() <= 2) {
+  if (x_dims.size() >= 3 && y_dims.size() <= 2) {
     // if transpose_X is true, the transpose cost much time
     if (!trans_x) {
       mat_dim_a.height_ *= mat_dim_a.batch_size_;
diff --git a/paddle/fluid/operators/matrix_power_op.cc b/paddle/fluid/operators/matrix_power_op.cc
index c65af3129f3646163925be95b27b9fec25207f8c..cdf204628b638f877c92e35a8941487aa39b5427 100644
--- a/paddle/fluid/operators/matrix_power_op.cc
+++ b/paddle/fluid/operators/matrix_power_op.cc
@@ -12,7 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/matrix_power_op.h"
+#include
+#include
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
 
 namespace paddle {
 namespace operators {
@@ -119,13 +122,3 @@ REGISTER_OPERATOR(matrix_power, ops::MatrixPowerOp, ops::MatrixPowerOpMaker,
                   ops::MatrixPowerGradOpMaker<paddle::imperative::OpBase>);
 
 REGISTER_OPERATOR(matrix_power_grad, ops::MatrixPowerGradOp);
-
-REGISTER_OP_CPU_KERNEL(
-    matrix_power,
-    ops::MatrixPowerKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MatrixPowerKernel<paddle::platform::CPUDeviceContext, double>);
-
-REGISTER_OP_CPU_KERNEL(
-    matrix_power_grad,
-    ops::MatrixPowerGradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MatrixPowerGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h
deleted file mode 100644
index d2c67d80b4f5a562d47e56173ecf1ea2f99bff56..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/matrix_power_op.h
+++ /dev/null
@@ -1,277 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/matrix_inverse.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -struct IdentityMatrixFunctor { - IdentityMatrixFunctor(const int m, T* output) : m_(m), output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int row = index / m_ % m_; - const int col = index % m_; - output_[index] = col == row ? static_cast(1) : static_cast(0); - } - - const int m_; - T* output_; -}; - -template -void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, - const paddle::framework::ExecutionContext& ctx) { - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - T* out_data = Out->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, X->numel()); - - if (n == 0) { - // Out = Identity Matrix - IdentityMatrixFunctor functor(x_dims[x_ndim - 1], out_data); - for_range(functor); - return; - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - if (new_n == 1) { - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, Out); - return; - } - - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (new_n == 2) { - // Out = newX * newX - Out->mutable_data(ctx.GetPlace()); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 3) { - // Out = (newX * newX) * newX - // Note: C[i] matrices in MatMul must not overlap, i.e. the individual - // gemm operations must be computable independently; otherwise, - // undefined behavior is expected. 
- Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, new_x, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } else if (new_n == 4) { - // Out = (newX * newX) * (newX * newX) - Tensor temp = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, no_trans_desc, new_x, no_trans_desc, static_cast(1), - &temp, static_cast(0)); - blas.MatMul(temp, no_trans_desc, temp, no_trans_desc, static_cast(1), - Out, static_cast(0)); - return; - } - - // Calculate Out = newX^{n} for abs(n) > 4 with time complexity as O(logN) - int bit = 0; - Tensor z = Tensor(X->dtype()); - bool out_inited = false; - Tensor temp_out = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor temp_z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - while (new_n > 0) { - bit = new_n & 0x1; - new_n >>= 1; - if (z.IsInitialized()) { - blas.MatMul(z, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_z, static_cast(0)); - framework::TensorCopy(temp_z, ctx.GetPlace(), dev_ctx, &z); - } else { - z = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - framework::TensorCopy(new_x, ctx.GetPlace(), dev_ctx, &z); - } - if (bit == 1) { - if (out_inited == true) { - blas.MatMul(*Out, no_trans_desc, z, no_trans_desc, static_cast(1), - &temp_out, static_cast(0)); - framework::TensorCopy(temp_out, ctx.GetPlace(), dev_ctx, Out); - } else { - framework::TensorCopy(z, ctx.GetPlace(), dev_ctx, Out); - out_inited = true; - } - } - } - return; -} - -template -class MatrixPowerKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - Tensor* Out = ctx.Output("Out"); - int n = ctx.Attr("n"); - - const auto& x_dims = X->dims(); - const int x_ndim = x_dims.size(); - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], x_dims[x_ndim - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) should be equal." 
- "X's shape[-2] = %d and shape[-1] = %d.", - x_dims[x_ndim - 2], x_dims[x_ndim - 1])); - - MatrixPowerFunction(X, n, Out, ctx); - } -}; - -template -void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, - const Tensor* dOut, const int n, Tensor* dX, - const paddle::framework::ExecutionContext& ctx) { - dX->mutable_data(ctx.GetPlace()); - const auto& x_dims = X->dims(); - - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (n == 0) { - // \nabla X = O - phi::funcs::SetConstant zero; - zero(dev_ctx, dX, static_cast(0)); - return; - } else if (n == 1) { - // \nabla X = \nabla Out - framework::TensorCopy(*dOut, ctx.GetPlace(), dev_ctx, dX); - return; - } - - auto trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, true); - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(x_dims, 0, false); - - if (n == -1) { - // \nabla X = Out^{T} * \nabla Out * Out^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*Out, trans_desc, *dOut, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, *Out, trans_desc, static_cast(1), dX, - static_cast(0)); - return; - } - - Tensor new_x = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - int new_n = n; - if (n > 0) { - // newX = X - framework::TensorCopy(*X, ctx.GetPlace(), dev_ctx, &new_x); - } else { - // newX = X^{-1}, n = -n - math::MatrixInverseFunctor mat_inv; - mat_inv(dev_ctx, *X, &new_x); - new_n = -n; - } - - // Use chain rule blow to compute \nabla newX^{n} - // First, Get newX^{0}, newX^{1}, ..., newX^{n - 1}, - // Note that newX^{0} can be omitted - std::vector> tensor_list(new_n - 1); - tensor_list[0] = std::make_shared(new_x); - int index = 1; - while (index < new_n - 1) { - tensor_list[index] = std::make_shared( - ctx.AllocateTmpTensor(X->dims(), dev_ctx)); - blas.MatMul(*tensor_list[index - 1], no_trans_desc, new_x, no_trans_desc, - static_cast(1), tensor_list[index].get(), static_cast(0)); - index++; - } - - // Second, \nabla newX = \sum_{i = 0}^{n - 1} (newX^{T}^{i} - // * \nabla Out - // * (newX^{T}^{n - i - 1}) - Tensor dx_new = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[new_n - 2], trans_desc, *dOut, no_trans_desc, - static_cast(1), &dx_new, static_cast(0)); - Tensor da_an_minus1 = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*dOut, no_trans_desc, *tensor_list[new_n - 2], trans_desc, - static_cast(1), &da_an_minus1, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), da_an_minus1.data(), - dx_new.data()); - int start = 0; - while (start < new_n - 2) { - Tensor a_da = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - Tensor a_da_a = ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(*tensor_list[start], trans_desc, *dOut, no_trans_desc, - static_cast(1), &a_da, static_cast(0)); - blas.MatMul(a_da, no_trans_desc, *tensor_list[new_n - 3 - start], - trans_desc, static_cast(1), &a_da_a, static_cast(0)); - blas.AXPY(X->numel(), static_cast(1), a_da_a.data(), - dx_new.data()); - start++; - } - - if (n > 0) { - // \nabla X = \nabla newX - framework::TensorCopy(dx_new, ctx.GetPlace(), dev_ctx, dX); - } else { - // \nabla X = newX^{T} * \nabla newX * newX^{T} - Tensor temp_dx = - ctx.AllocateTmpTensor(X->dims(), dev_ctx); - blas.MatMul(new_x, trans_desc, dx_new, no_trans_desc, static_cast(-1), - &temp_dx, static_cast(0)); - blas.MatMul(temp_dx, no_trans_desc, new_x, trans_desc, static_cast(1), - dX, static_cast(0)); - } - return; -} - -template -class 
MatrixPowerGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* X = ctx.Input("X"); - const Tensor* Out = ctx.Input("Out"); - const Tensor* dOut = ctx.Input(framework::GradVarName("Out")); - const int n = ctx.Attr("n"); - Tensor* dX = ctx.Output(framework::GradVarName("X")); - - MatrixPowerGradFunction(X, Out, dOut, n, dX, ctx); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/maxout_op.cc b/paddle/fluid/operators/maxout_op.cc index bd9ebd29777def2fafca648ad80bc57bef8df316..e55369e0691ee5e36da76c53c6dd5d13288231f4 100644 --- a/paddle/fluid/operators/maxout_op.cc +++ b/paddle/fluid/operators/maxout_op.cc @@ -12,14 +12,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "paddle/fluid/operators/maxout_op.h" #include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace operators { -using framework::Tensor; - class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -130,10 +130,3 @@ REGISTER_OPERATOR( paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker); REGISTER_OPERATOR(maxout_grad, ops::MaxOutOpGrad); -REGISTER_OP_CPU_KERNEL( - maxout, ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_CPU_KERNEL( - maxout_grad, - ops::MaxOutGradKernel, - ops::MaxOutGradKernel); diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h deleted file mode 100644 index 922998293943ed5ee1ebcd08b5bcd93467496cb9..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/maxout_op.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MaxOutKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - Tensor* out = context.Output("Out"); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - math::MaxOutFunctor maxout_forward; - maxout_forward(context.template device_context(), *in_x, out, - groups, axis); - } -}; - -template -class MaxOutGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - int groups = context.template Attr("groups"); - int axis = context.template Attr("axis"); - if (axis < 0) { - axis += in_x->dims().size(); - } - - auto& device_ctx = context.template device_context(); - phi::funcs::SetConstant zero; - if (in_x_grad) { - in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0.0)); - math::MaxOutGradFunctor maxout_backward; - maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups, - axis); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 3692ace8bb5a46b06bd10a07a5d5d95d8825bdc6..32ef052119883944abc1876f8bf3a8c028ddc57a 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -21,69 +23,6 @@ class AccuracyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Out"), true, - platform::errors::NotFound("Input (Out) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Indices"), true, - platform::errors::NotFound( - "Input (Indices) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Label"), true, - platform::errors::NotFound( - "Input (Label) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Accuracy"), true, - platform::errors::NotFound( - "Output (Accuracy) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Correct"), true, - platform::errors::NotFound( - "Output (Correct) of AccuracyOp is not found.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Total"), true, - platform::errors::NotFound( - "Output (Total) of AccuracyOp is not found.")); - - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "Accuracy"); - OP_INOUT_CHECK(ctx->HasInput("Indices"), "Input", "Indices", "Accuracy"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Accuracy"), "Output", "Accuracy", - "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Correct"), "Output", "Correct", "Accuracy"); - OP_INOUT_CHECK(ctx->HasOutput("Total"), "Output", "Total", "Accuracy"); - - auto inference_dim = ctx->GetInputDim("Out"); - auto label_dim = ctx->GetInputDim("Label"); - // Assume indices has same shape as inference, because - // it's the output of topk. - - PADDLE_ENFORCE_EQ( - label_dim.size(), 2, - platform::errors::InvalidArgument( - "ShapeError: label's dimensions of AccuracyOp must be 2. " - "But received label's dimensions = %d, label's shape = [%s]", - label_dim.size(), label_dim)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(label_dim[1], 1, - platform::errors::InvalidArgument( - "ShapeError: label's second dimension of " - "AccuracyOp must be 1. But received label's " - "second dimension is = %d, label's shape = [%s]", - label_dim[1], label_dim)); - PADDLE_ENFORCE_EQ( - inference_dim[0], label_dim[0], - platform::errors::InvalidArgument( - "ShapeError: the output's num_rows of AccuracyOp must be" - " the same as label's num_rows. But received output's " - "shape = [%s], label's shape = [%s], output's num_rows = %d, " - "label's " - "num_rows = %d", - inference_dim, label_dim, inference_dim[0], label_dim[0])); - } - - ctx->SetOutputDim("Accuracy", {1}); - ctx->SetOutputDim("Correct", {1}); - ctx->SetOutputDim("Total", {1}); - ctx->ShareLoD("Out", /*->*/ "Accuracy"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -123,13 +62,13 @@ with the input Out(Inference). } // namespace operators } // namespace paddle +// FIXME(typhoonzero): types of T is for infernece data. +// label data is always int. 
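// Note: the InferShape logic above moved to phi::AccuracyInferMeta and the
// kernels now live in phi; the computation itself is a top-k hit rate over
// the `Indices` produced by top_k. A minimal sketch of that logic
// (`TopKAccuracy` is a hypothetical helper written for illustration, not
// part of this patch):
//
//   #include <cstdint>
//   #include <vector>
//
//   float TopKAccuracy(const std::vector<int64_t>& indices,  // n x k, row-major
//                      const std::vector<int64_t>& labels,   // n
//                      int64_t k) {
//     const int64_t n = static_cast<int64_t>(labels.size());
//     if (n == 0) return 0.0f;
//     int64_t correct = 0;
//     for (int64_t i = 0; i < n; ++i) {
//       for (int64_t j = 0; j < k; ++j) {
//         if (indices[i * k + j] == labels[i]) {  // any of the k guesses hits
//           ++correct;
//           break;
//         }
//       }
//     }
//     return static_cast<float>(correct) / static_cast<float>(n);
//   }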
+DECLARE_INFER_SHAPE_FUNCTOR(accuracy, AccuracyInferShapeFunctor, + PD_INFER_META(phi::AccuracyInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR( accuracy, ops::AccuracyOp, ops::AccuracyOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -// FIXME(typhoonzero): types of T is for infernece data. -// label data is always int. -REGISTER_OP_CPU_KERNEL(accuracy, - ops::AccuracyKernel, - ops::AccuracyKernel); + paddle::framework::EmptyGradOpMaker, + AccuracyInferShapeFunctor); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cu b/paddle/fluid/operators/metrics/accuracy_op.cu deleted file mode 100644 index 6f19100fa9d37e2efedad60a982bf19b09cac736..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.cu +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/operators/metrics/accuracy_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void AccuracyCudaKernel(const int N, const int D, - const int64_t* Xdata, - const int64_t* labeldata, int* correct_data, - float* accuracy, int* total_data) { - int count = 0; - __shared__ int total[BlockSize]; - - // support only 1 block - for (int i = threadIdx.x; i < (N); i += BlockSize) { - for (int j = 0; j < D; ++j) { - if (Xdata[i * D + j] == labeldata[i]) { - ++count; - break; - } - } - } - total[threadIdx.x] = count; - __syncthreads(); - -// reduce the count with init value 0, and output accuracy. -#ifdef PADDLE_WITH_CUDA - int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); -#else - // HIP thrust::reduce not support __device__ - for (int s = BlockSize / 2; s > 0; s >>= 1) { - if (threadIdx.x < s) { - total[threadIdx.x] += total[threadIdx.x + s]; - } - __syncthreads(); - } - int result = total[0]; -#endif - if (threadIdx.x == 0) { - *correct_data = result; - *accuracy = static_cast(result) / static_cast(N); - *total_data = N; - } -} - -template -class AccuracyOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - // FIXME(typhoonzero): only support indices currently - // if add support for output values, how to detect the data type? 
- const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - int num_samples = static_cast(inference->dims()[0]); - size_t infer_width = inference->dims()[1]; - auto stream = ctx.cuda_device_context().stream(); - platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream); - - if (num_samples == 0) { - return; - } - - AccuracyCudaKernel< - PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - num_samples, infer_width, indices_data, label_data, correct_data, - accuracy_data, total_data); - } -}; - -} // namespace operators -} // namespace paddle - -// FIXME(typhoonzero): types of T is for inference data. -// label data is always int64 -REGISTER_OP_CUDA_KERNEL( - accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/fluid/operators/metrics/accuracy_op.h b/paddle/fluid/operators/metrics/accuracy_op.h deleted file mode 100644 index 94e5bf8257e67b9fd01aa9ae45a25d90963fef13..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/metrics/accuracy_op.h +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class AccuracyKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* inference = ctx.Input("Out"); - auto* indices = ctx.Input("Indices"); - auto* label = ctx.Input("Label"); - auto* accuracy = ctx.Output("Accuracy"); - auto* correct = ctx.Output("Correct"); - auto* total = ctx.Output("Total"); - - int* correct_data = correct->mutable_data(ctx.GetPlace()); - int* total_data = total->mutable_data(ctx.GetPlace()); - float* accuracy_data = accuracy->mutable_data(ctx.GetPlace()); - - const int64_t* indices_data = indices->data(); - const int64_t* label_data = label->data(); - - size_t num_samples = inference->dims()[0]; - size_t class_dim = inference->dims()[1]; - *accuracy_data = 0.0f; - - if (num_samples == 0) { - return; - } - - int num_correct = 0; - // assume inference is already the topk of the output - for (size_t i = 0; i < num_samples; ++i) { - PADDLE_ENFORCE_GE( - label_data[i], 0, - platform::errors::InvalidArgument( - "label of AccuracyOp must >= 0, But received label[%d] is %d", i, - label_data[i])); - for (size_t j = 0; j < class_dim; ++j) { - if (indices_data[i * class_dim + j] == label_data[i]) { - ++num_correct; - break; - } - } - } - - *correct_data = num_correct; - *total_data = num_samples; - *accuracy_data = - static_cast(num_correct) / static_cast(num_samples); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index 2598d3b0277c94a52e1fa14b04c00b595071f312..1ce02ff4525c9692f88ed42b79ff336cc0113c41 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc index e83278f88b82a31eb445a0a86e3003e96acf395e..9f2ca4165f33a28902bfe20207b12bad2af49fad 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -13,7 +13,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index de71312d78df99adc3b3663f2fcbb3943373982e..3cc1be4de8a82ff263824ab4852178f735596d45 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -14,12 +14,14 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/metrics/accuracy_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { +using Tensor = paddle::framework::Tensor; template class AccuracyXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc index 54ecba08a82dcea9482314cc0a26b3ce2d07ec4f..f3ed98c3f4d1e47a8b7dff81a998c7574859baa2 100644 --- a/paddle/fluid/operators/metrics/auc_op.cc +++ b/paddle/fluid/operators/metrics/auc_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,70 +24,6 @@ class AucOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Predict"), "Input", "Predict", "Auc"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "Auc"); - auto predict_dims = ctx->GetInputDim("Predict"); - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE_GE( - predict_dims.size(), 2, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape size must be " - "greater_equal 2.", - predict_dims)); - auto predict_width = predict_dims[1]; - PADDLE_ENFORCE_NE( - phi::product(predict_dims), 0, - platform::errors::InvalidArgument( - "The Input(Predict) has not been initialized properly. The " - "shape of Input(Predict) = [%s], the shape can not involes 0.", - predict_dims)); - PADDLE_ENFORCE_NE( - phi::product(label_dims), 0, - platform::errors::InvalidArgument( - "The Input(Label) has not been initialized properly. 
The " - "shape of Input(Label) = [%s], the shape can not involes 0.", - label_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_LE(predict_width, 2, - platform::errors::InvalidArgument( - "Only support binary classification," - "prediction dims[1] should be 1 or 2")); - } - auto predict_height = ctx->GetInputDim("Predict")[0]; - auto label_height = ctx->GetInputDim("Label")[0]; - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(predict_height, label_height, - platform::errors::InvalidArgument( - "Out and Label should have same height.")); - } - - int num_pred_buckets = ctx->Attrs().Get("num_thresholds") + 1; - int slide_steps = ctx->Attrs().Get("slide_steps"); - - PADDLE_ENFORCE_GE( - num_pred_buckets, 1, - platform::errors::InvalidArgument("num_thresholds must larger than 1")); - PADDLE_ENFORCE_GE(slide_steps, 0, - platform::errors::InvalidArgument( - "slide_steps must be natural number")); - - ctx->SetOutputDim("AUC", {1}); - - if (slide_steps) { - ctx->SetOutputDim("StatPosOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - ctx->SetOutputDim("StatNegOut", - {(1 + slide_steps) * num_pred_buckets + 1}); - } else { - ctx->SetOutputDim("StatPosOut", {1, num_pred_buckets}); - ctx->SetOutputDim("StatNegOut", {1, num_pred_buckets}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -145,4 +84,7 @@ There are two types of possible curves: } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker); +DECLARE_INFER_SHAPE_FUNCTOR(auc, AucInferShapeFunctor, + PD_INFER_META(phi::AucInferMeta)); +REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker, + AucInferShapeFunctor); diff --git a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc index 780c6e7f153e7b1179e203bc7807dd7818aa591a..a3b764b0e1c46ab91b989ed7f7b0b5df101f7654 100644 --- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc @@ -13,19 +13,32 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/shape_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" namespace paddle { namespace operators { -using paddle::framework::Tensor; +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = phi::SelectedRows; template -class ShapeMKLDNNKernel : public ShapeKernel { +class ShapeMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ShapeKernel::Compute(ctx); + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } + auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); + auto out_data = out_t->mutable_data(platform::CPUPlace()); + for (int i = 0; i < in_dims.size(); ++i) { + out_data[i] = in_dims[i]; + } auto* out = ctx.Output("Out"); out->set_layout(framework::DataLayout::kMKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 05cd264cf3ec9ee6e47d822d7e4d79ab7cd64441..23428dd403e9b1ef62007c7b9193ed3b8482cab3 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -29,7 +29,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index c776cf2a7c792c429fcf45a367d3f06bf9add5d2..e9dadd5ec937cd11c84777a582cc1f7ac9fc3c33 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -27,7 +27,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index 3791fed23a84ff51d022dd24a6a0734a39636a70..9d0062e31388413fd4a441687631faebe8846c6e 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -24,14 +24,17 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP(pool2d); USE_OP_DEVICE_KERNEL(pool2d, MKLDNN); -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); +PD_DECLARE_KERNEL(relu, CPU, ALL_LAYOUT); + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc index 884521301750ce92c3f0a2e0b9468c5cc4a57790..6e3bd5e43c9c1d7e5c8a5dd4ba37afcfd7147e20 100644 --- a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc +++ b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc @@ -22,7 +22,7 @@ limitations under the License. 
*/ namespace fw = paddle::framework; namespace plat = paddle::platform; -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_DEVICE_KERNEL(relu, MLU); // relu diff --git a/paddle/fluid/operators/mode_op.cu b/paddle/fluid/operators/mode_op.cu index afb949d3374c62f561e910ea77e516bdb4004ac0..2bacda8afb0eb340c4c8d4068f3013e2adbc7f91 100644 --- a/paddle/fluid/operators/mode_op.cu +++ b/paddle/fluid/operators/mode_op.cu @@ -24,7 +24,6 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mode_op.h" #include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/nll_loss_op.cc b/paddle/fluid/operators/nll_loss_op.cc index f510c7bebec876d034c1af923a4f7077c096000c..a4e1f7b3091a9f692e479300310333bfdd359096 100644 --- a/paddle/fluid/operators/nll_loss_op.cc +++ b/paddle/fluid/operators/nll_loss_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/nll_loss_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/ternary.h" namespace paddle { namespace operators { @@ -23,77 +25,6 @@ class NLLLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "NLLLoss"); - OP_INOUT_CHECK(ctx->HasOutput("Total_weight"), "Output", "Total_weight", - "NLLLoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto label_dims = ctx->GetInputDim("Label"); - auto reduction = ctx->Attrs().Get("reduction"); - - PADDLE_ENFORCE_EQ(x_dims.size() == 2 || x_dims.size() == 4, true, - platform::errors::InvalidArgument( - "The tensor rank of Input(X) must be 2 or 4.")); - bool contain_unknown_dim = phi::contain_unknown_dim(x_dims) || - phi::contain_unknown_dim(label_dims); - bool check = ctx->IsRuntime() || !contain_unknown_dim; - if (check) { - PADDLE_ENFORCE_EQ( - x_dims[0], label_dims[0], - platform::errors::InvalidArgument( - "ShapeError: Expected input batch_size to match label batch_size," - "But received: the Input(x) batch_size is [%s], the Input(label) " - " batch_size is [%s].", - x_dims[0], label_dims[0])); - if (ctx->HasInput("Weight")) { - auto w_dims = ctx->GetInputDim("Weight"); - PADDLE_ENFORCE_EQ(w_dims.size(), 1, - platform::errors::InvalidArgument( - "Input(Weight) should be a 1D tensor.")); - PADDLE_ENFORCE_EQ( - x_dims[1], w_dims[0], - platform::errors::InvalidArgument( - "Expected input tensor Weight's size should equal " - "to the first dimension of the input tensor X. 
But received " - "Weight's " - "size is %d, the first dimension of input X is %d", - w_dims[0], x_dims[1])); - } - } - if (x_dims.size() == 2) { - if (reduction == "none") { - ctx->SetOutputDim("Out", {x_dims[0]}); - } else { - ctx->SetOutputDim("Out", {1}); - } - } else if (x_dims.size() == 4) { - PADDLE_ENFORCE_EQ(label_dims.size(), 3, - platform::errors::InvalidArgument( - "Expected Input(Lable) dimensions=3, received %d.", - label_dims.size())); - auto input0 = x_dims[0]; - auto input2 = x_dims[2]; - auto input3 = x_dims[3]; - auto label0 = label_dims[0]; - auto label1 = label_dims[1]; - auto label2 = label_dims[2]; - PADDLE_ENFORCE_EQ( - input0 == label0 && input2 == label1 && input3 == label2, true, - platform::errors::InvalidArgument("Input(X) tensor shape should " - "match to Input(Label) tensor " - "shape.")); - if (reduction == "none") { - ctx->SetOutputDim("Out", {x_dims[0], x_dims[2], x_dims[3]}); - } else { - ctx->SetOutputDim("Out", {1}); - } - } - ctx->SetOutputDim("Total_weight", {1}); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -259,15 +190,11 @@ class NLLLossGradMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(nll_loss, NllLossRawInferShapeFunctor, + PD_INFER_META(phi::NllLossRawInferMeta)); namespace ops = paddle::operators; REGISTER_OPERATOR(nll_loss, ops::NLLLossOp, ops::NLLLossOpMaker, ops::NLLLossGradMaker, - ops::NLLLossGradMaker); + ops::NLLLossGradMaker, + NllLossRawInferShapeFunctor); REGISTER_OPERATOR(nll_loss_grad, ops::NLLLossGradOp); -REGISTER_OP_CPU_KERNEL( - nll_loss, ops::NLLLossOpKernel, - ops::NLLLossOpKernel); -REGISTER_OP_CPU_KERNEL( - nll_loss_grad, - ops::NLLLossGradOpKernel, - ops::NLLLossGradOpKernel); diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h deleted file mode 100644 index be6f4422d4ac6a475477c025c4b76eabdbf4f9e0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/nll_loss_op.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -static void nll_loss_1D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int64_t i = 0; i < batch_size; ++i) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "Label value is out of range. 
" - "Expected label value in range of [0, %d), but " - "received value is %d.", - n_classes, cur_label)); - - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - out_data[i] = -x_data[i * n_classes + cur_label] * cur_weight; - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int64_t i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - out_data[i] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= x_data[i * n_classes + cur_label] * cur_weight; - } - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -static void nll_loss_2D(T* out_data, T* total_weight_data, const T* x_data, - const int64_t* label_data, const T* weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - out_data[index] = -x_data[i * sample_size + cur_label * map_size + - h * in_dim3 + w] * - cur_weight; - } - } - } - return; - } - - T output_val = 0; - T total_weight_val = 0; - - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - out_data[index] = 0; - continue; - } - PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, - platform::errors::InvalidArgument( - "label should not be out of bounds.")); - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - total_weight_val += cur_weight; - output_val -= - x_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] * - cur_weight; - } - } - } - - if (reduction == "mean" && total_weight_val != 0) { - output_val /= total_weight_val; - } - *out_data = output_val; - *total_weight_data = total_weight_val; -} - -template -class NLLLossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* out = ctx.Output("Out"); - auto* total_weight = ctx.Output("Total_weight"); - auto reduction = ctx.Attr("reduction"); - auto ignore_index = ctx.Attr("ignore_index"); - - auto x_data = x->data(); - auto label_data = labels->data(); - auto weight_data = weight ? 
weight->data() : nullptr; - auto out_data = out->mutable_data(ctx.GetPlace()); - auto total_weight_data = total_weight->mutable_data(ctx.GetPlace()); - *total_weight_data = 0; - - auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_1D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_2D(out_data, total_weight_data, x_data, label_data, - weight_data, batch_size, n_classes, in_dim2, in_dim3, - reduction, ignore_index); - } - } -}; - -template -static void nll_loss_grad_1D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const std::string reduction, - const int64_t ignore_index) { - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_data[i] * cur_weight; - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - const auto cur_label = label_data[i]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * n_classes + cur_label] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[i * n_classes + cur_label] /= total_weight_val; - } - } -} - -template -static void nll_loss_grad_2D(T* dx_data, const T* dout_data, - const int64_t* label_data, const T* weight_data, - const T* total_weight_data, - const int64_t batch_size, const int64_t n_classes, - const int64_t in_dim2, const int64_t in_dim3, - const std::string reduction, - const int64_t ignore_index) { - const auto map_size = in_dim2 * in_dim3; - const auto sample_size = n_classes * map_size; - - if (reduction == "none") { - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? weight_data[cur_label] : static_cast(1); - dx_data[i * sample_size + cur_label * map_size + h * in_dim3 + w] = - -cur_weight * dout_data[index]; - } - } - } - return; - } - - const T dout_val = *dout_data; - const T total_weight_val = *total_weight_data; - for (int i = 0; i < batch_size; i++) { - for (int h = 0; h < in_dim2; h++) { - for (int w = 0; w < in_dim3; w++) { - const auto index = i * map_size + h * in_dim3 + w; - const auto cur_label = label_data[index]; - if (cur_label == ignore_index) { - continue; - } - const auto cur_weight = - weight_data ? 
weight_data[cur_label] : static_cast(1); - const auto dx_index = - i * sample_size + cur_label * map_size + h * in_dim3 + w; - dx_data[dx_index] = -dout_val * cur_weight; - if (reduction == "mean") { - dx_data[dx_index] /= total_weight_val; - } - } - } - } -} - -template -class NLLLossGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* weight = ctx.Input("Weight"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* total_weight = ctx.Input("Total_weight"); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto ignore_index = ctx.Attr("ignore_index"); - auto reduction = ctx.Attr("reduction"); - - auto dx_data = dx->mutable_data(ctx.GetPlace()); - auto dout_data = dout->data(); - auto label_data = labels->data(); - auto weight_data = weight ? weight->data() : nullptr; - auto total_weight_data = total_weight->data(); - memset(dx_data, 0, dx->numel() * sizeof(T)); - - const auto x_dims = x->dims(); - const auto batch_size = x_dims[0]; - const auto n_classes = x_dims[1]; - - if (x_dims.size() == 2) { - nll_loss_grad_1D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, reduction, - ignore_index); - } else if (x_dims.size() == 4) { - const auto in_dim2 = x_dims[2]; - const auto in_dim3 = x_dims[3]; - nll_loss_grad_2D(dx_data, dout_data, label_data, weight_data, - total_weight_data, batch_size, n_classes, in_dim2, - in_dim3, reduction, ignore_index); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc index b96fcaa486cce8099cf1d03c7d948ea74c1923ad..372a71706ab5ec72b6da4cbac1b63333f42cb265 100644 --- a/paddle/fluid/operators/op_debug_string_test.cc +++ b/paddle/fluid/operators/op_debug_string_test.cc @@ -17,8 +17,10 @@ #include "glog/logging.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/kernel_registry.h" USE_OP_ITSELF(elementwise_add_grad); +PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 229e61ac9fe79d3c171d1f0612f22f3590587231..dc162ae5782f2690fcf6378603268369e4aeb9ca 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
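[Annotation] For reference, the math carried by the nll_loss_1D helper deleted above, restated as a self-contained function. Assumptions, since this is an illustration rather than code from the PR: T = float, no Weight input, no ignore_index handling.

#include <cstdint>
#include <vector>

// Negative log likelihood with "mean" reduction: negate the true-class entry
// of each row, then divide by the accumulated (here: unit) weights.
float nll_loss_mean_1d(const std::vector<float>& x,        // [batch, n_classes] log-probs
                       const std::vector<int64_t>& label,  // [batch]
                       int64_t n_classes) {
  float loss = 0.f, total_weight = 0.f;
  for (size_t i = 0; i < label.size(); ++i) {
    loss -= x[i * n_classes + label[i]];  // output_val -= x[...] in the deleted code
    total_weight += 1.f;                  // cur_weight == 1 when Weight is absent
  }
  return total_weight != 0.f ? loss / total_weight : loss;
}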
*/ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -28,37 +30,6 @@ class PadOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Pad"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Pad"); - - auto x_dim = ctx->GetInputDim("X"); - auto& paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE_EQ( - static_cast(paddings.size()), x_dim.size() * 2, - platform::errors::InvalidArgument( - "Size of 'paddings' dimension should be equal to 2 * size of " - "Input(X)'s dimension, but received (size of 'paddings' dimension " - "is) %d vs (2 * size of Input(X)'s dimension is) %d.", - static_cast(paddings.size()), x_dim.size() * 2)); - for (size_t i = 0; i < paddings.size(); ++i) { - PADDLE_ENFORCE_GE(paddings[i], 0, - platform::errors::InvalidArgument( - "The element of 'paddings' should >= 0, but " - "received %d for index %d.", - paddings[i], static_cast(i))); - } - std::vector out_dims(x_dim.size()); - for (int i = 0; i < x_dim.size(); ++i) { - if ((!ctx->IsRuntime()) && (x_dim[i] == -1)) { - out_dims[i] = -1; - } else { - out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1]; - } - } - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - if (out_dims[0] == x_dim[0]) { - // Only pass LoD when the first dimension is equal between - // output and input. - ctx->ShareLoD("X", /*->*/ "Out"); - } } }; @@ -160,10 +131,13 @@ class PadOpDoubleGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(pad, PadInferShapeFunctor, + PD_INFER_META(phi::PadInferMeta)); REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker, - ops::PadOpGradMaker); + ops::PadOpGradMaker, + PadInferShapeFunctor); REGISTER_OPERATOR(pad_grad, ops::PadOpGrad, ops::PadOpDoubleGradMaker, ops::PadOpDoubleGradMaker); diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc index 6b0d6f332bcae8890cdfaccb1244886daa63ae42..54e31845ad4bd5ddfa81bc90a10391f027dffc11 100644 --- a/paddle/fluid/operators/put_along_axis_op.cc +++ b/paddle/fluid/operators/put_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
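[Annotation] The pad hunk above is another instance of the PR-wide migration: the hand-written InferShape override is deleted and the operator is registered with an infer-shape functor that forwards to a phi InferMeta function (DECLARE_INFER_SHAPE_FUNCTOR plus PD_INFER_META(phi::PadInferMeta)). The shape rule that moves is small; a standalone sketch of what the deleted code computed, and what PadInferMeta is presumably expected to reproduce:

#include <cstdint>
#include <vector>

// Each dimension grows by its leading and trailing pad; a dimension that is
// still unknown (-1) before runtime stays unknown.
std::vector<int64_t> pad_out_dims(const std::vector<int64_t>& x_dim,
                                  const std::vector<int>& paddings,  // two entries per dim
                                  bool is_runtime) {
  std::vector<int64_t> out(x_dim.size());
  for (size_t i = 0; i < x_dim.size(); ++i) {
    out[i] = (!is_runtime && x_dim[i] == -1)
                 ? -1
                 : x_dim[i] + paddings[2 * i] + paddings[2 * i + 1];
  }
  return out;
}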
*/ -#include "paddle/fluid/operators/put_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -123,16 +124,3 @@ REGISTER_OPERATOR(put_along_axis, ops::PutAlongAxisOp, ops::PutAlongAxisOpMaker, paddle::operators::PutAlongAxisInplaceInferer); REGISTER_OPERATOR(put_along_axis_grad, ops::PutAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(put_along_axis, ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel, - ops::PutAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel, - ops::PutAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.cu b/paddle/fluid/operators/put_along_axis_op.cu deleted file mode 100644 index 5508023efad2c60a00f5ea3a8d1b853c6e5ba1fb..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/put_along_axis_op.cu +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/put_along_axis_op.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PutAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisCUDAKernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - const platform::DeviceContext &device_ctx = ctx.device_context(); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if 
(index_type == framework::proto::VarType::INT64) { - gpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " - "default reduce op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpCUDAKernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - gpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel( - *result_grad, axis, *index, *value_grad, - ctx.device_context()); // the gradient of scatter is gather - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(put_along_axis, ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel, - ops::PutAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(put_along_axis_grad, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel, - ops::PutAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/put_along_axis_op.h b/paddle/fluid/operators/put_along_axis_op.h deleted file mode 100644 index 38487f5ce28c9e35dd6e84403b88dbc0fdfa07b3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/put_along_axis_op.h +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class PutAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisOpKernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto value = ctx.Input("Value"); - auto index = ctx.Input("Index"); - auto reduce_op = ctx.Attr("Reduce"); - auto result = ctx.Output("Result"); - - framework::TensorCopy(*input, ctx.GetPlace(), result); - const platform::DeviceContext &device_ctx = ctx.device_context(); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (reduce_op == "add") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "multiply" || reduce_op == "mul") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_mul_kernel(*result, axis, *index, *value, - device_ctx); - } - } else if (reduce_op == "assign") { - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_assign_kernel(*result, axis, *index, *value, - device_ctx); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not support reduce_op: '%s' for scatter kernel, only " - "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " - "default reduce " - "op is 'assign' ", - reduce_op)); - return; - } - } -}; - -template -class PutAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "PutAlongAxisGradOpKernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto value_grad = ctx.Output(framework::GradVarName("Value")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (input_grad) { - framework::TensorCopy(*result_grad, ctx.GetPlace(), input_grad); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_input_grad_kernel( - // Here passing an unused argument *result_grad, because it's - // convenient to instantiate a bunch of template functions with the - // same arguments list.
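[Annotation] The CPU kernels deleted here mirror the CUDA ones above: both only choose between scatter primitives (cpu_/gpu_scatter_{add,mul,assign}_kernel) based on the Reduce attribute and the index dtype. A 1-D illustration of the three reduce modes, purely for orientation (not PR code):

#include <cstdint>
#include <string>
#include <vector>

void put_along_axis_1d(std::vector<float>* result,  // copy of Input, updated in place
                       const std::vector<int64_t>& index,
                       const std::vector<float>& value,
                       const std::string& reduce_op) {
  for (size_t i = 0; i < index.size(); ++i) {
    float& r = (*result)[index[i]];
    if (reduce_op == "add") {
      r += value[i];
    } else if (reduce_op == "multiply" || reduce_op == "mul") {
      r *= value[i];
    } else {  // "assign", the default
      r = value[i];
    }
  }
}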
- *result_grad, axis, *index, *input_grad, ctx.device_context()); - } else { - cpu_scatter_input_grad_kernel( - *result_grad, axis, *index, *input_grad, ctx.device_context()); - } - } - - if (value_grad) { - value_grad->Resize(index->dims()); - value_grad->mutable_data(ctx.GetPlace()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*result_grad, axis, *index, *value_grad, - ctx.device_context()); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index 24741efe426b18b7cecae9332c522d67aee98d63..c7e91ba35dee1356ddd71ade0fe9892f8032c77b 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 21c23a7f602a35acf676e97a9134c2c43a73126c..4b6759ea165edf29add66ee44461fdd4d9f84d00 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -70,9 +70,25 @@ BufferedReader::BufferedReader( stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + int dev_idx = place_.device; + compute_stream_ = + ((platform::MLUDeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events_.resize(buffer_size); + for (auto &event : events_) { + event = platform::MluEventResourcePool::Instance().New(dev_idx); + } + stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx); + } +#endif cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); + mlu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -256,6 +272,56 @@ void BufferedReader::ReadAsync(size_t i) { platform::NPUStreamSync(stream_.get()); } #endif + +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(place_)) { + TensorVec &mlu = mlu_buffer_[i]; + if (mlu.empty()) { + mlu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ( + mlu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on MLU and CPU devices are not matched. 
" + "The number on MLU is %d, on CPU is %d", + mlu.size(), cpu.size())); + } + + std::vector mlu_ptrs; + mlu_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + mlu[i].Resize(cpu[i].dims()); + mlu[i].set_layout(cpu[i].layout()); + mlu_ptrs.emplace_back(mlu[i].mutable_data(place_, cpu[i].type())); + } + + platform::SetMLUDeviceId(place_.device); + PADDLE_ENFORCE_MLU_SUCCESS( + cnPlaceNotifier(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnWaitNotifier(events_[i].get())); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto mlu_ptr = mlu_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + if ((platform::is_mlu_place(cpu_place))) { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + } else { + memory::Copy(place_, mlu_ptr, cpu_place, cpu_ptr, size, + stream_.get()); + platform::MLUStreamSync(stream_.get()); + } + mlu[i].set_lod(cpu[i].lod()); + } + platform::MLUStreamSync(stream_.get()); + } +#endif return i; })); } @@ -291,6 +357,8 @@ void BufferedReader::ReadNextImpl(std::vector *out) { *out = std::move(cuda_buffer_[i]); } else if (platform::is_npu_place(place_)) { *out = std::move(npu_buffer_[i]); + } else if (platform::is_mlu_place(place_)) { + *out = std::move(mlu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 3d42486c6df8815aaab8e55e29898700bb74d953..f0f3b6b7f9fdfeb69c46e7122fae5c6cfbf3a169 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -29,6 +29,11 @@ #include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/device/npu/npu_resource_pool.h" #endif +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" +#endif + namespace paddle { namespace operators { namespace reader { @@ -70,6 +75,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector cpu_buffer_; std::vector cuda_buffer_; std::vector npu_buffer_; + std::vector mlu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; @@ -82,6 +88,12 @@ class BufferedReader : public framework::DecoratedReader { std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_MLU + mluStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc index cb438b4a8057267015c8b3c15dd8468fca5a4b44..41df8e4a15f093a40a31c70eea98dfb7e575f4cd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cc @@ -14,15 +14,28 @@ #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" -REGISTER_REDUCE_OP(reduce_max); -REGISTER_OP_CPU_KERNEL( - reduce_max, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + +namespace ops = paddle::operators; + +class ReduceMaxOpMaker : public 
ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_max"; } + virtual std::string GetOpType() const { return "Reduce reduce_max"; } +}; + +DECLARE_INFER_SHAPE_FUNCTOR(reduce_max, ReduceMaxInferShapeFunctor, + PD_INFER_META(phi::ReduceInferMetaBase)); + +REGISTER_OPERATOR( + reduce_max, ops::ReduceOp, ReduceMaxOpMaker, + paddle::framework::DefaultGradOpMaker, + paddle::framework::DefaultGradOpMaker, + ReduceMaxInferShapeFunctor); +REGISTER_OPERATOR(reduce_max_grad, ops::ReduceGradOp) + REGISTER_OP_CPU_KERNEL( reduce_max_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index 894106883cb0a09fed0e2335144cc3867fc99cc7..4a18330913803f822436118a35fb957b7e31b391 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -97,7 +97,7 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker { }; DECLARE_INFER_SHAPE_FUNCTOR(reduce_mean, ReduceMeanInferShapeFunctor, - PD_INFER_META(phi::MeanRawInferMeta)); + PD_INFER_META(phi::ReduceInferMetaBase)); REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__, ops::ReduceMeanOpGradMaker, diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index eb76eee104889042e470e65414a011afd0420d0f..160617695338a9f2e140b7b418c93ef0d7c57e17 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -36,9 +36,9 @@ void TensorReduceImpl(const platform::CUDADeviceContext& dev_ctx, gpuStream_t stream) { y->mutable_data(x.place()); - phi::funcs::TensorReduceImpl( + phi::funcs::ReduceKernel( static_cast(dev_ctx), x, y, transform, - origin_reduce_dims, stream); + origin_reduce_dims); } } // namespace operators diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 6559ed479c84cacc3cf90ea7a32eb703da6bb602..2a78774f3706e73bd8931e80fe020faac58d7ff5 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -103,7 +103,7 @@ class ReduceSumOpMaker : public ops::ReduceOpMaker { }; DECLARE_INFER_SHAPE_FUNCTOR(reduce_sum, ReduceSumInferShapeFunctor, - PD_INFER_META(phi::ReduceInferMetaBase)); + PD_INFER_META(phi::SumRawInferMeta)); REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, ops::ReduceSumVarTypeInference, @@ -114,16 +114,3 @@ REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumGradNoNeedBufferVarInferer); - -template -using CPUReduceSumGradKernel = - ops::ReduceSumGradKernel; - -REGISTER_OP_CPU_KERNEL( - reduce_sum_grad, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel, CPUReduceSumGradKernel, - CPUReduceSumGradKernel>, - CPUReduceSumGradKernel>); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu deleted file mode 100644 index 2f6bf127518090916c4b947daf1d1f202fdd5960..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" -#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" - -template -using CUDAReduceSumGradKernel = - ops::ReduceCudaGradKernel; - -REGISTER_OP_CUDA_KERNEL( - reduce_sum_grad, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel>, - CUDAReduceSumGradKernel>); diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h index b636184ae457edf5c8028fecfb92a3ea96f5a0d9..a473b54c1f855945a5f3f0ac8d0826b15494ba1a 100644 --- a/paddle/fluid/operators/rnn_op.h +++ b/paddle/fluid/operators/rnn_op.h @@ -16,9 +16,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/unique_op.h" @@ -36,6 +36,14 @@ using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; using TensorList = std::vector; +template +using EigenMatrix = framework::EigenMatrix; + +template +using EigenVector = framework::EigenVector; + #define DEFINE_MODE_DETECTOR(MODE_NAME, MODE_STR) \ inline bool is_##MODE_NAME(const framework::ExecutionContext& ctx) { \ const std::string& mode = ctx.Attr("mode"); \ diff --git a/paddle/fluid/operators/segment_pool_op.cc b/paddle/fluid/operators/segment_pool_op.cc index 322cd97f01c3ad97ba74f049696fdec592ee524e..9d4c8532a82c064b1b7aef759934ad8dad894ec5 100644 --- a/paddle/fluid/operators/segment_pool_op.cc +++ b/paddle/fluid/operators/segment_pool_op.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
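[Annotation] On the reduce_op.cu.h hunk earlier in this region: phi::funcs::TensorReduceImpl took an explicit gpuStream_t, while its replacement phi::funcs::ReduceKernel does not, since the phi device context passed as the first argument already carries the stream. Call shape after the change, with assumptions spelled out: the template arguments and the cast target are elided in the extract above, so Tx/Ty/ReduceOp/TransformOp are placeholders and const phi::GPUContext& is assumed.

phi::funcs::ReduceKernel<Tx, Ty, ReduceOp, TransformOp>(
    static_cast<const phi::GPUContext&>(dev_ctx),  // assumed cast target
    x, y, transform, origin_reduce_dims);          // no trailing stream argument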
*/ -#include "paddle/fluid/operators/segment_pool_op.h" #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -23,22 +26,6 @@ class SegmentPoolOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SegmentPool"); - OP_INOUT_CHECK(ctx->HasInput("SegmentIds"), "Input", "SegmentIds", - "SegmentPool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SegmentPool"); - auto dims = ctx->GetInputDim("X"); - dims[0] = -1; - ctx->SetOutputDim("Out", dims); - - if (ctx->Attrs().Get("pooltype") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("SummedIds"), "Output", "SummedIds", - "SegmentPool"); - ctx->SetOutputDim("SummedIds", {-1, 1}); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -150,17 +137,11 @@ class SegmentPoolGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(segment_pool, SegmentPoolInferShapeFunctor, + PD_INFER_META(phi::SegmentPoolInferMeta)); + REGISTER_OPERATOR(segment_pool, ops::SegmentPoolOp, ops::SegmentPoolOpMaker, ops::SegmentPoolGradOpMaker, - ops::SegmentPoolGradOpMaker); + ops::SegmentPoolGradOpMaker, + SegmentPoolInferShapeFunctor); REGISTER_OPERATOR(segment_pool_grad, ops::SegmentPoolGradOp); - -REGISTER_OP_CPU_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); - -REGISTER_OP_CPU_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.cu b/paddle/fluid/operators/segment_pool_op.cu deleted file mode 100644 index e147e62a98354087121ca1443b20d9163ef00f73..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/segment_pool_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/segment_pool_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - segment_pool, - ops::SegmentPoolKernel, - ops::SegmentPoolKernel); -REGISTER_OP_CUDA_KERNEL( - segment_pool_grad, - ops::SegmentPoolGradKernel, - ops::SegmentPoolGradKernel); diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h deleted file mode 100644 index 2f5ef7f54f988884a25feba4665283d3ce260988..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/segment_pool_op.h +++ /dev/null @@ -1,176 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/segment_pooling.h" -#include "paddle/fluid/platform/macros.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { - auto* input = context.Input("X"); - auto* segment = context.Input("SegmentIds"); - auto* output = context.Output("Out"); - std::string pooltype = context.Attr("pooltype"); - Tensor* summed_ids = nullptr; - - int64_t num_indices = segment->numel(); - PADDLE_ENFORCE_EQ( - num_indices, input->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be the same size as dimension 0 of input X.")); - PADDLE_ENFORCE_EQ(num_indices, segment->dims()[0], - platform::errors::InvalidArgument( - "Segment_ids should be 1-D tensor, or its other " - "dimension size is 1. Segment_ids's shape is: [%s].", - segment->dims())); - - if (input->numel() == 0 || segment->numel() == 0) { - return; - } - - bool cpu_place = context.GetPlace().GetType() == phi::AllocationType::CPU; - if (cpu_place) { - auto dims = input->dims(); - auto* segment_ids = segment->data(); - dims[0] = static_cast(segment_ids[segment->numel() - 1] + 1); - PADDLE_ENFORCE_GT( - dims[0], 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", dims[0])); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, output, static_cast(0)); - } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (!cpu_place) { - Tensor length; - length.mutable_data(phi::make_ddim({1}), platform::CPUPlace()); - IndexT* length_data = length.data(); - const IndexT* segment_ids = segment->data(); - -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - hipMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - hipMemcpyDeviceToHost)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - cudaMemcpy(length_data, segment_ids + num_indices - 1, sizeof(IndexT), - cudaMemcpyDeviceToHost)); -#endif - - IndexT length_host = length_data[0]; - length_host++; - PADDLE_ENFORCE_GT( - length_host, 0, - platform::errors::InvalidArgument( - "Segment ids must be >= 0, but got last id %d", length_data[0])); - auto dims = input->dims(); - dims[0] = static_cast(length_host); - output->Resize({dims}); - output->mutable_data(context.GetPlace()); - T init_value = 0; - if (pooltype == "MAX") { - init_value = static_cast(-FLT_MAX); - } else if (pooltype == "MIN") { - init_value = static_cast(FLT_MAX); - } - phi::funcs::SetConstant setconst; - auto& dev_ctx = context.template device_context(); - setconst(dev_ctx, output, static_cast(init_value)); - // the gpu kernel of mean pool records the counts of segment_ids - if
(pooltype == "MEAN") { - summed_ids = context.Output("SummedIds"); - summed_ids->Resize({dims[0], 1}); - summed_ids->mutable_data(context.GetPlace()); - setconst(dev_ctx, summed_ids, static_cast(1e-12)); - } - } -#endif - - SegmentPoolFunctor pool; - - pool(context.template device_context(), *input, *segment, - output, summed_ids, pooltype); -} - -template -class SegmentPoolKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* segment = context.Input("SegmentIds"); - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentKernelLaunchHelper(context); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentKernelLaunchHelper(context); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -template -class SegmentPoolGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input = context.Input("X"); - auto* output = context.Input("Out"); - auto* segment = context.Input("SegmentIds"); - auto* out_g = context.Input(framework::GradVarName("Out")); - auto* in_g = context.Output(framework::GradVarName("X")); - std::string pooltype = context.Attr("pooltype"); - - const Tensor* summed_ids = nullptr; - if (pooltype == "MEAN") { - summed_ids = context.Input("SummedIds"); - } - - in_g->mutable_data(context.GetPlace()); - phi::funcs::SetConstant set_zero; - auto& dev_ctx = context.template device_context(); - set_zero(dev_ctx, in_g, static_cast(0)); - - auto index_type = framework::TransToProtoVarType(segment->dtype()); - if (index_type == framework::proto::VarType::INT32) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else if (index_type == framework::proto::VarType::INT64) { - SegmentPoolGradFunctor pool; - pool(context.template device_context(), *input, *output, - *out_g, *segment, in_g, summed_ids, pooltype); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported index type, Expected int, int64, but got %s.", - index_type)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc index 6c33ff52044b26b598f835ee40462a01077c1ff8..23c6a0133e1edafba5621825db78a52b88e6947a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc @@ -184,9 +184,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { col_data, paddle::platform::errors::Fatal("XPU memory is not enough")); if (in_g || filter_g) { - int r = xpu::constant(xpu_context, col_data, col_numel, T(0)); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); - bool trans_a = false; bool trans_b = true; int m = out_g->dims()[0]; @@ -208,7 +205,7 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { const T* data_b = filter->data(); T* data_c = col_data; - r = xpu::fc_fusion( + int r = xpu::fc_fusion( xpu_context, data_a, data_b, data_c, m, n, k, trans_a, trans_b, nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr, xpu::Activation_t::LINEAR); @@ -222,7 +219,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { 
in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); - xpu::constant(xpu_context, in_g->data(), in_g->numel(), T(0)); int r = xpu::sequence_context_projection_grad( xpu_context, in_g->data(), col_data, nullptr, lodx, sequence_width, @@ -232,8 +228,6 @@ class SequenceConvGradXPUKernel : public framework::OpKernel { if (filter_g) { filter_g->mutable_data(context.GetPlace()); - xpu::constant(xpu_context, filter_g->data(), filter_g->numel(), - T(0)); int r = xpu::sequence_context_projection( xpu_context, in->data(), col_data, nullptr, lodx, sequence_width, diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index ec3e04e71faf0b20950d87de1a7f066e2e49310a..7d0d782b837c4c828996e993634373ab38d88eac 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -241,13 +241,6 @@ REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, ops::SetValueGradMaker, ops::SetValueOpInplaceInferer); -REGISTER_OP_CPU_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - REGISTER_OPERATOR(set_value_grad, ops::SetValueGrad); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/set_value_op.cu b/paddle/fluid/operators/set_value_op.cu index f9701b0acaac769bd91bbba156a010c2e05e42c3..9f291a863c067ae0210f44befb89191678291441 100644 --- a/paddle/fluid/operators/set_value_op.cu +++ b/paddle/fluid/operators/set_value_op.cu @@ -16,13 +16,6 @@ namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); - REGISTER_OP_CUDA_KERNEL( set_value_grad, ops::SetValueGradKernel, diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 4f7eb0357e9e12947accd2f28500c10ef858b697..a5ef7e8efbe7764a7d8292c07ad1047190500402 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -121,203 +121,6 @@ inline void CheckIsDimsMatch(const framework::DDim first, "of target shape: %d, but now shape is %d.", second.to_str(), first.to_str())); } - -template -class SetValueKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const { - const int rank = ctx.Input("Input")->dims().size(); - - // TODO(liym27): A more elegant code to do this. C++ has to make template - // integer as constant, but we had better have alternative writing in the - // future.
- switch (rank) { - case 1: - SetValueCompute<1>(ctx); - break; - case 2: - SetValueCompute<2>(ctx); - break; - case 3: - SetValueCompute<3>(ctx); - break; - case 4: - SetValueCompute<4>(ctx); - break; - case 5: - SetValueCompute<5>(ctx); - break; - case 6: - SetValueCompute<6>(ctx); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "The rank of input should be less than 7, but received %d.", rank)); - } - } - - private: - template - void SetValueCompute(const framework::ExecutionContext& ctx) const { - auto* in = ctx.Input("Input"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto* out = ctx.Output("Out"); - - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); - - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); - auto steps = ctx.Attr>("steps"); - auto shape = ctx.Attr>("shape"); - auto decrease_axes = ctx.Attr>("decrease_axes"); - auto none_axes = ctx.Attr>("none_axes"); - - if (!starts_tensor_list.empty()) { - starts = GetDataFromTensorList(starts_tensor_list); - } - if (!ends_tensor_list.empty()) { - ends = GetDataFromTensorList(ends_tensor_list); - } - if (!steps_tensor_list.empty()) { - steps = GetDataFromTensorList(steps_tensor_list); - } - - auto in_dims = in->dims(); - phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = - phi::funcs::GetSliceDims(in_dims, axes, starts, ends, &steps); - auto decrease_slice_dims = - phi::funcs::GetDecreasedDims(slice_dims, decrease_axes); - - auto slice_dims_for_assign = decrease_slice_dims; - if (!none_axes.empty()) { - std::vector slice_dims_with_none; - - size_t none_axes_cur = 0, decrease_axes_cur = 0; - for (int i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= i) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - if (decrease_axes_cur < decrease_axes.size() && - decrease_axes[decrease_axes_cur] == i) { - decrease_axes_cur++; - } else { - slice_dims_with_none.push_back(slice_dims[i]); - } - } - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - - slice_dims_for_assign = phi::make_ddim(slice_dims_with_none); - } - - auto place = ctx.GetPlace(); - auto& eigen_place = - *ctx.template device_context().eigen_device(); - - // Here copy data from input to avoid data loss at PE and Graph level. - // TODO(liym27): Speed up in the future version. - // - Q: Why don't call ShareDataWith to speed up? - // - A: Because it's not supported to ShareDataWith on OP's input and output - // https://github.com/PaddlePaddle/Paddle/wiki/ShareDataWith-and-ShareBufferWith-are-prohibited-in-OP - // - Q: Why don't delete Input, after all, the input and output are the same - // Tensor at program level? - // - A: If deleting Input, the graph will be complex, such as there will - // be two ops points to the output in graph: op1 -> output <- set_value. - // In this case, we have to find a way to handle the running order of - // set_value is what we want. 
- paddle::framework::TensorCopy(*in, place, out); - - Tensor slice_tensor(in->dtype()), pad_tensor(in->dtype()); - slice_tensor.mutable_data(slice_dims, place); - pad_tensor.mutable_data(in_dims, place); - - auto pad_e = framework::EigenTensor::From(pad_tensor, in_dims); - auto out_e = framework::EigenTensor::From(*out); - auto slice_e = framework::EigenTensor::From(slice_tensor, slice_dims); - - // Step 1: Set the value of out at `_index` to zero - slice_e.device(eigen_place) = slice_e.constant(T(0)); - - auto starts_indices = Eigen::DSizes(); - auto ends_indices = Eigen::DSizes(); - auto strides_indices = Eigen::DSizes(); - - for (size_t i = 0; i < D; ++i) { - starts_indices[i] = 0; - ends_indices[i] = slice_dims[i]; - strides_indices[i] = 1; - } - for (size_t i = 0; i < axes.size(); i++) { - int axis_index = axes[i]; - starts_indices[axis_index] = starts[i]; - ends_indices[axis_index] = ends[i]; - strides_indices[axis_index] = steps[i]; - if (starts[i] == ends[i]) { // slice is empty, data will not be changed - return; - } - } - - out_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 2: Set a tensor with the same shape as out tensor. And its data at - // '_index' is the same as value_tensor, and data out of '_index' to zero - - // - Step 2.1 Set slice tensor with value - - // NOTE(liym27): [ Why resize slice_tensor here? ] - // A: When do broadcasting on slice_tensor and value_tensor, the shape of - // slice_tensor should be decreased dims. - // e.g. - // x[:,0] = value_tensor - // x's shape = [3, 4], value_tensor's shape = [3] - // We get slice_dims = [3, 1], decrease_slice_dims = [3] - // If do broadcasting on Tensor with shape [3, 1] and [3], the result's - // shape is [3, 3], which cross the border; - // If do broadcasting on Tensor with shape [3] and [3], the result's shape - // is [3], which is right. 
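[Annotation] A worked trace of the three-step assignment scheme in the surrounding deleted SetValueKernel (steps 2.2 and 3 appear just below). Take out = [[1, 2], [3, 4]] and the assignment x[:, 0] = [9, 9]:

// step 1: zero the slice inside out                   -> [[ 0, 2], [ 0, 4]]
// step 2: pad = (0 - value) at the slice, 0 elsewhere -> [[-9, 0], [-9, 0]]
// step 3: out = out - pad                             -> [[ 9, 2], [ 9, 4]]

So the subtraction in step 3 simultaneously restores the untouched entries and writes value into the sliced positions.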
- - slice_tensor.Resize(slice_dims_for_assign); - if (value_tensor != nullptr) { - CheckIsDimsMatch(slice_dims_for_assign, value_tensor->dims()); - // ElementwiseComputeEx can do broadcasting - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); - } else { - Tensor value_t(in->dtype()); - auto value_dims = phi::make_ddim(shape); - CheckIsDimsMatch(slice_dims_for_assign, value_dims); - - value_t.mutable_data(value_dims, place); - auto value_name = - GetValueName(framework::TransToProtoVarType(in->dtype())); - CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); - value_t.Resize(value_dims); - ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_tensor, &value_t, -1, SubFunctor(), &slice_tensor); - } - slice_tensor.Resize(slice_dims); - - // - Step 2.2 Pad slice tensor with 0 - pad_e.device(eigen_place) = pad_e.constant(T(0)); - pad_e.stridedSlice(starts_indices, ends_indices, strides_indices) - .device(eigen_place) = slice_e; - - // Step 3: Set out tensor with value_tensor - out_e.device(eigen_place) = out_e - pad_e; - } -}; - template class SetValueGradKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 599697059c4dcfa54fa728a8ebf88ad95f387774..46d64333b608b7f3e7b3d83664978d162b6d6e52 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -174,6 +174,9 @@ class SetValueNPUKernel : public framework::OpKernel { .AddInput(std::move(index_indices)) .AddInput(val_temp) .AddOutput(out_temp) +#if (CANN_VERSION_CODE >= 504001) + .AddAttrs({{"use_locking", false}}) +#endif .Run(stream); } }; diff --git a/paddle/fluid/operators/shape_op.cc b/paddle/fluid/operators/shape_op.cc index 5b7ccdde81097a2cfd74c3d65c0679d277b766a3..e2c8359beb1290f7b1b592c1ff24b15986f41f73 100644 --- a/paddle/fluid/operators/shape_op.cc +++ b/paddle/fluid/operators/shape_op.cc @@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/shape_op.h" #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -95,9 +93,3 @@ REGISTER_OPERATOR( shape, ops::ShapeOp, ops::ShapeOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, ops::ShapeKernel, - ops::ShapeKernel, - ops::ShapeKernel>, - ops::ShapeKernel>); diff --git a/paddle/fluid/operators/shape_op.cu b/paddle/fluid/operators/shape_op.cu deleted file mode 100644 index c6e380a94f84db7de53d0c218682813fcad0128d..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shape_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/shape_op.h"
-#include "paddle/fluid/platform/complex.h"
-
-REGISTER_OP_CUDA_KERNEL(
-    shape, paddle::operators::ShapeKernel<bool>,
-    paddle::operators::ShapeKernel<int>, paddle::operators::ShapeKernel<int8_t>,
-    paddle::operators::ShapeKernel<uint8_t>,
-    paddle::operators::ShapeKernel<int64_t>,
-    paddle::operators::ShapeKernel<float>,
-    paddle::operators::ShapeKernel<double>,
-    paddle::operators::ShapeKernel<paddle::platform::float16>,
-    paddle::operators::ShapeKernel<paddle::platform::complex<float>>,
-    paddle::operators::ShapeKernel<paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h
deleted file mode 100644
index 39ebcca46a710e0b817792105046af70b6298fc1..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/shape_op.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <vector>
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using SelectedRows = phi::SelectedRows;
-
-template <typename T>
-class ShapeKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_var = ctx.InputVar("Input");
-    framework::DDim in_dims;
-    if (in_var->IsType<phi::SelectedRows>()) {
-      in_dims = in_var->Get<phi::SelectedRows>().value().dims();
-    } else {
-      in_dims = in_var->Get<LoDTensor>().dims();
-    }
-    auto* out_t = ctx.Output<LoDTensor>("Out");
-    out_t->Resize({in_dims.size()});
-    auto out_data = out_t->mutable_data<int32_t>(platform::CPUPlace());
-    for (int i = 0; i < in_dims.size(); ++i) {
-      out_data[i] = in_dims[i];
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/shape_op_npu.cc b/paddle/fluid/operators/shape_op_npu.cc
index 7bff7b2d668347692309d3695eb46b1fbdb6c7dd..f751ab41014c21fda2403bd69bcd20ad549e40c7 100644
--- a/paddle/fluid/operators/shape_op_npu.cc
+++ b/paddle/fluid/operators/shape_op_npu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/shape_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/shape_op_xpu.cc b/paddle/fluid/operators/shape_op_xpu.cc
index 2e9092a643253843ed09ab7475ec3ed723d5e3b8..a62d1b434e76434c3710e45e723060d3f452c91c 100644
--- a/paddle/fluid/operators/shape_op_xpu.cc
+++ b/paddle/fluid/operators/shape_op_xpu.cc
@@ -10,12 +10,41 @@
  * limitations under the License.
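[Editor's aside] The deleted ShapeKernel (and its XPU twin added below) reduces to a one-liner; the point worth noting is that the result is always written to host (CPUPlace) memory, since a shape is rank-length metadata rather than device data. A minimal sketch, not Paddle code:

```cpp
#include <cstdint>
#include <vector>

// shape(x) is just x's dims as an int32 vector of length rank(x); for a
// SelectedRows input, the dims of the underlying value tensor are used.
std::vector<int32_t> shape_of(const std::vector<int64_t>& dims) {
  return std::vector<int32_t>(dims.begin(), dims.end());
}
```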
 */
 
 #ifdef PADDLE_WITH_XPU
+#include
+#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/shape_op.h"
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using SelectedRows = phi::SelectedRows;
+
+template <typename T>
+class ShapeXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_var = ctx.InputVar("Input");
+    framework::DDim in_dims;
+    if (in_var->IsType<phi::SelectedRows>()) {
+      in_dims = in_var->Get<phi::SelectedRows>().value().dims();
+    } else {
+      in_dims = in_var->Get<LoDTensor>().dims();
+    }
+    auto* out_t = ctx.Output<LoDTensor>("Out");
+    out_t->Resize({in_dims.size()});
+    auto out_data = out_t->mutable_data<int32_t>(platform::CPUPlace());
+    for (int i = 0; i < in_dims.size(); ++i) {
+      out_data[i] = in_dims[i];
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(shape, ops::ShapeKernel<bool>, ops::ShapeKernel<int>,
-                       ops::ShapeKernel<int64_t>, ops::ShapeKernel<float>,
-                       ops::ShapeKernel<double>);
+REGISTER_OP_XPU_KERNEL(shape, ops::ShapeXPUKernel<bool>,
+                       ops::ShapeXPUKernel<int>, ops::ShapeXPUKernel<int64_t>,
+                       ops::ShapeXPUKernel<float>, ops::ShapeXPUKernel<double>);
 
 #endif
diff --git a/paddle/fluid/operators/shard_index_op.cc b/paddle/fluid/operators/shard_index_op.cc
index 54555e494ffe5f2c226c7aabd47b4ce991dab2ec..053a90f2fc9fa2f93c2647c420a046401198bc28 100644
--- a/paddle/fluid/operators/shard_index_op.cc
+++ b/paddle/fluid/operators/shard_index_op.cc
@@ -12,7 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/shard_index_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -20,27 +23,6 @@ namespace operators {
 class ShardIndexOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ShardIndex");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ShardIndex");
-
-    auto x_dims = ctx->GetInputDim("X");
-    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      platform::errors::InvalidArgument(
-                          "Rank of Input(X) should be at least 2, "
-                          "but the value given is %d.",
-                          x_dims.size()));
-    if (ctx->IsRuntime() || x_dims[x_dims.size() - 1] > 0) {
-      PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], 1U,
-                        platform::errors::InvalidArgument(
-                            "The last dimension of Input(X) should be 1, "
-                            "but the value given is %d.",
-                            x_dims[x_dims.size() - 1]));
-    }
-
-    ctx->SetOutputDim("Out", x_dims);
-    ctx->ShareLoD("X", /* --> */ "Out");
-  }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
@@ -114,7 +96,10 @@ Examples:
 } // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(shard_index, ops::ShardIndexOp,
-                             ops::ShardIndexOpMaker);
-REGISTER_OP_CPU_KERNEL(shard_index, ops::ShardIndexCPUKernel<int>,
-                       ops::ShardIndexCPUKernel<int64_t>);
+DECLARE_INFER_SHAPE_FUNCTOR(shard_index, ShardIndexInferShapeFunctor,
+                            PD_INFER_META(phi::ShardIndexInferMeta));
+REGISTER_OPERATOR(
+    shard_index, ops::ShardIndexOp, ops::ShardIndexOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    ShardIndexInferShapeFunctor);
diff --git a/paddle/fluid/operators/shard_index_op.cu b/paddle/fluid/operators/shard_index_op.cu
deleted
file mode 100644 index 115b3f47d664ba00228343d221d5be70d13a7ff1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shard_index_op.cu +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/shard_index_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" - -namespace paddle { -namespace operators { - -using platform::PADDLE_CUDA_NUM_THREADS; - -template -__global__ void ShardIndexInner(const T* in_data, T* out_data, - const int64_t numel, const int index_num, - const int nshards, const int shard_id, - const int ignore_value) { - int shard_size = (index_num + nshards - 1) / nshards; - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < numel) { - assert(in_data[idx] >= 0 && in_data[idx] < index_num); - if (in_data[idx] / shard_size == shard_id) { - out_data[idx] = in_data[idx] % shard_size; - } else { - out_data[idx] = ignore_value; - } - } -} - -using LoDTensor = framework::LoDTensor; - -template -class ShardIndexCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - PADDLE_ENFORCE_GT( - index_num, 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, shard_id)); - - out->Resize(in->dims()); - out->set_lod(in->lod()); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - auto stream = - context.template device_context().stream(); - ShardIndexInner<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, numel, index_num, nshards, shard_id, ignore_value); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(shard_index, ops::ShardIndexCUDAKernel, - ops::ShardIndexCUDAKernel); diff --git a/paddle/fluid/operators/shard_index_op.h b/paddle/fluid/operators/shard_index_op.h 
deleted file mode 100644 index c2fe3711686d4c4c802fadd66d4bc994232ef5ec..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/shard_index_op.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using LoDTensor = framework::LoDTensor; -template -class ShardIndexCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); - auto* out = context.Output("Out"); - int index_num = context.Attr("index_num"); - int nshards = context.Attr("nshards"); - int shard_id = context.Attr("shard_id"); - int ignore_value = context.Attr("ignore_value"); - PADDLE_ENFORCE_GT( - index_num, 0, - platform::errors::InvalidArgument( - "The value 'index_num' for Op(shard_index) must be greater than 0, " - "but the value given is %d.", - index_num)); - PADDLE_ENFORCE_GT(nshards, 0, - platform::errors::InvalidArgument( - "The value 'nshard' for Op(shard_index) must be " - "greater than 0, but the value given is %d.", - nshards)); - PADDLE_ENFORCE_GE( - shard_id, 0, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be greater or " - "equal to 0, but the value given is %d.", - shard_id)); - PADDLE_ENFORCE_LT( - shard_id, nshards, - platform::errors::InvalidArgument( - "The value 'shard_id' for Op(shard_index) must be less than " - "nshards (%d), but the value given is %d.", - nshards, shard_id)); - - int shard_size = (index_num + nshards - 1) / nshards; - - out->Resize(in->dims()); - out->set_lod(in->lod()); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - int64_t numel = in->numel(); - for (int64_t i = 0; i < numel; ++i) { - PADDLE_ENFORCE_GE(in_data[i], 0, - platform::errors::InvalidArgument( - "The input_index for Op(shard_index) must be " - "greater or equal to 0, but the value given is %d.", - in_data[i])); - PADDLE_ENFORCE_LT(in_data[i], index_num, - platform::errors::InvalidArgument( - "The input_index for Op(shard_index) must be less " - "than index_num (%d), but the value given is %d.", - index_num, in_data[i])); - if (in_data[i] / shard_size == shard_id) { - out_data[i] = in_data[i] % shard_size; - } else { - out_data[i] = ignore_value; - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/shard_index_op_npu.cc b/paddle/fluid/operators/shard_index_op_npu.cc index dc2e8ad58f31ce8fe845ecb1f368544704e1d9ad..c875448424a24e686b9a6285725f801d604abc46 100644 --- a/paddle/fluid/operators/shard_index_op_npu.cc +++ b/paddle/fluid/operators/shard_index_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
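[Editor's aside] The deleted CPU and CUDA shard_index kernels above implement the same arithmetic: each shard owns a contiguous block of ceil(index_num / nshards) indices, owned indices are remapped to a local offset, and all others become ignore_value. A standalone sketch (our names, not Paddle code):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> shard_index(const std::vector<int64_t>& in, int index_num,
                                 int nshards, int shard_id, int ignore_value) {
  // Each shard owns ceil(index_num / nshards) consecutive indices.
  const int shard_size = (index_num + nshards - 1) / nshards;
  std::vector<int64_t> out(in.size());
  for (size_t i = 0; i < in.size(); ++i) {
    out[i] = (in[i] / shard_size == shard_id) ? in[i] % shard_size
                                              : ignore_value;
  }
  return out;
}

int main() {
  // 20 indices over 2 shards: shard 0 owns [0, 10), shard 1 owns [10, 20).
  for (int64_t v : shard_index({1, 9, 10, 19}, /*index_num=*/20, /*nshards=*/2,
                               /*shard_id=*/1, /*ignore_value=*/-1)) {
    std::cout << v << " ";  // prints: -1 -1 0 9
  }
  std::cout << "\n";
  return 0;
}
```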
-#include "paddle/fluid/operators/shard_index_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 8e502fc04dbdb06386839d1ebe63c91dc392a2d0..016ff54645b02e9b3ddfb67595d830ccf5dcfd94 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -15,7 +15,10 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { @@ -26,46 +29,6 @@ const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", - "SigmoidCrossEntropyWithLogitsOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", - "SigmoidCrossEntropyWithLogitsOp"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, labels_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." - "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, labels_dims.size())); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ( - phi::slice_ddim(x_dims, 0, rank), - phi::slice_ddim(labels_dims, 0, rank), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same shape " - "except the last dimension. 
But received: the shape of " - "Input(X) is [%s], the shape of Input(Label) is [%s].", - x_dims, labels_dims)); - } - - ctx->ShareDim("X", /*->*/ "Out"); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class SigmoidCrossEntropyWithLogitsGradOp @@ -201,12 +164,17 @@ DECLARE_INPLACE_OP_INFERER(SigmoidCrossEntropyWithLogitsGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR( + sigmoid_cross_entropy_with_logits, + SigmoidCrossEntropyWithLogitsInferShapeFunctor, + PD_INFER_META(phi::SigmoidCrossEntropyWithLogitsInferMeta)); REGISTER_OPERATOR( sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp, ops::SigmoidCrossEntropyWithLogitsOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, ops::SigmoidCrossEntropyWithLogitsGradOpMaker, - ops::SigmoidCrossEntropyWithLogitsInplaceInferer); + ops::SigmoidCrossEntropyWithLogitsInplaceInferer, + SigmoidCrossEntropyWithLogitsInferShapeFunctor); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp, ops::SigmoidCrossEntropyWithLogitsGradInplaceInferer); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index 3bc55fafd81e18d0a986268ff4692129c6515edc..3148b31a8322e2bab39ad7f723ee59a6db64c204 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 956544c53609eb29326dc5cf295d978d767ac176..d61f5aa3f634cd2aee1e5c2f34f4467b1697e455 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/take_along_axis_op.cc b/paddle/fluid/operators/take_along_axis_op.cc index 664f1031915e4661769d9b2844c5388f0efa91c0..fa8a5e92712ec86a01ca01b7eb644e289c03000a 100644 --- a/paddle/fluid/operators/take_along_axis_op.cc +++ b/paddle/fluid/operators/take_along_axis_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
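[Editor's aside] The InferShape body removed from sigmoid_cross_entropy_with_logits above now lives in phi as SigmoidCrossEntropyWithLogitsInferMeta. The contract it has to preserve can be sketched standalone (illustrative only, not the phi code):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// X and Label must agree in rank and (statically known) shape; Out mirrors X
// (and shares its LoD, in Paddle terms). -1 marks a dynamic dimension.
std::vector<int64_t> infer_out_dims(const std::vector<int64_t>& x_dims,
                                    const std::vector<int64_t>& label_dims) {
  assert(x_dims.size() == label_dims.size() && "rank mismatch");
  for (size_t i = 0; i < x_dims.size(); ++i) {
    if (x_dims[i] >= 0 && label_dims[i] >= 0) {
      assert(x_dims[i] == label_dims[i] && "shape mismatch");
    }
  }
  return x_dims;
}
```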
*/ -#include "paddle/fluid/operators/take_along_axis_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/phi/core/ddim.h" @@ -139,16 +140,3 @@ REGISTER_OPERATOR(take_along_axis, ops::TakeAlongAxisOp, ops::TakeAlongAxisGradOpMaker); REGISTER_OPERATOR(take_along_axis_grad, ops::TakeAlongAxisGradOp); - -REGISTER_OP_CPU_KERNEL(take_along_axis, ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel, - ops::TakeAlongAxisOpKernel); - -REGISTER_OP_CPU_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel, - ops::TakeAlongAxisGradOpKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.cu b/paddle/fluid/operators/take_along_axis_op.cu deleted file mode 100644 index b6c62d497b379dda568f661b31366914e6870a7c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/take_along_axis_op.cu +++ /dev/null @@ -1,97 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/take_along_axis_op.h" -#include "paddle/phi/core/ddim.h" - -namespace paddle { -namespace operators { - -template -class TakeAlongAxisCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet( - "This kernel only runs on GPU device.")); - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - gpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on GPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. 
- auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - - if (index_type == framework::proto::VarType::INT32) { - gpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - gpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(take_along_axis, ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel, - ops::TakeAlongAxisCUDAKernel); -REGISTER_OP_CUDA_KERNEL(take_along_axis_grad, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel, - ops::TakeAlongAxisGradOpCUDAKernel); diff --git a/paddle/fluid/operators/take_along_axis_op.h b/paddle/fluid/operators/take_along_axis_op.h deleted file mode 100644 index fc781dbddf2ad25de3728e76d231d0164d46c08e..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/take_along_axis_op.h +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class TakeAlongAxisOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input = ctx.Input("Input"); - auto axis = ctx.Attr("Axis"); - auto index = ctx.Input("Index"); - auto result = ctx.Output("Result"); - result->Resize(index->dims()); - result->mutable_data(ctx.GetPlace()); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } else if (index_type == framework::proto::VarType::INT64) { - cpu_gather_kernel(*input, axis, *index, *result, - ctx.device_context()); - } - } -}; - -template -class TakeAlongAxisGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), true, - platform::errors::PreconditionNotMet("This kernel only runs on CPU.")); - - auto input_grad = ctx.Output(framework::GradVarName("Input")); - auto index = ctx.Input("Index"); - auto result_grad = ctx.Input(framework::GradVarName("Result")); - auto axis = ctx.Attr("Axis"); - // We need to know the shape of input matrix to determine the shape of grad - // matrix of input. - auto input = ctx.Input("Input"); - input_grad->Resize(input->dims()); - input_grad->mutable_data(ctx.GetPlace()); - - // Set to zero tensor. - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant functor; - functor(reinterpret_cast(dev_ctx), - input_grad, static_cast(0)); - - const auto &index_type = framework::TransToProtoVarType(index->dtype()); - if (index_type == framework::proto::VarType::INT32) { - cpu_scatter_add_kernel( - *input_grad, axis, *index, *result_grad, - ctx.device_context()); // the gradient of gather is scatter - } else if (index_type == framework::proto::VarType::INT64) { - cpu_scatter_add_kernel(*input_grad, axis, *index, - *result_grad, ctx.device_context()); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index a7c7e33f58af6ce8f59a301d1fc5ccdf511b608f..1de1b590a1311b81f16ba05e746402e1fc14c556 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/phi/core/ddim.h" -USE_OP(relu); +USE_OP_ITSELF(relu); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(softmax); diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index dc12f8e8892a022c6f55f4fe3a6237a7a01fa290..e179149c5bb77bd642f744be48109a941c66febf 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -12,11 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
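[Editor's aside] Both deleted take_along_axis kernel families (CPU in take_along_axis_op.h above, CUDA in take_along_axis_op.cu) share one idea: the forward pass is a gather along an axis and the backward pass is a scatter-add at the same indices. A standalone sketch for axis 0 (our names, not Paddle code):

```cpp
#include <iostream>
#include <vector>

using Mat = std::vector<std::vector<float>>;
using IdxMat = std::vector<std::vector<int>>;

// Forward, axis == 0: out[i][j] = x[index[i][j]][j].
Mat take_along_axis0(const Mat& x, const IdxMat& index) {
  Mat out(index.size(), std::vector<float>(x[0].size(), 0.f));
  for (size_t i = 0; i < index.size(); ++i)
    for (size_t j = 0; j < x[0].size(); ++j) out[i][j] = x[index[i][j]][j];
  return out;
}

// Backward: the gradient of a gather is a scatter-add; += rather than =,
// because the same source element may be gathered more than once.
Mat take_along_axis0_grad(const Mat& dout, const IdxMat& index, size_t x_rows) {
  Mat dx(x_rows, std::vector<float>(dout[0].size(), 0.f));
  for (size_t i = 0; i < index.size(); ++i)
    for (size_t j = 0; j < dout[0].size(); ++j) dx[index[i][j]][j] += dout[i][j];
  return dx;
}

int main() {
  Mat x = {{1, 2}, {3, 4}};
  IdxMat idx = {{1, 0}, {1, 1}};
  Mat y = take_along_axis0(x, idx);                           // {{3, 2}, {3, 4}}
  Mat dx = take_along_axis0_grad({{1, 1}, {1, 1}}, idx, 2);   // {{0, 1}, {2, 1}}
  std::cout << y[0][0] << " " << dx[1][0] << "\n";            // 3 2
  return 0;
}
```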
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/tile_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { @@ -26,66 +30,6 @@ class TileOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Tile"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Tile"); - auto x_dims = ctx->GetInputDim("X"); - auto repeat_times = ctx->Attrs().Get>("repeat_times"); - if (repeat_times.size() == 0) { - repeat_times = std::vector(x_dims.size(), -1); - } - - PADDLE_ENFORCE_LE( - x_dims.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, x_dims.size())); - PADDLE_ENFORCE_LE( - repeat_times.size(), MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must not be greater than %d, but the value received is %d.", - MAX_RANK_SUPPORTED, repeat_times.size())); - PADDLE_ENFORCE_GE( - repeat_times.size(), 1, - platform::errors::InvalidArgument( - "The size of the shape of input 'repeat_times' for tile op " - "must be positive integers, but the value received is %d.", - repeat_times.size())); - - auto out_rank = - std::max(static_cast(x_dims.size()), repeat_times.size()); - std::vector out_shape(out_rank); - auto x_dim_vec = phi::vectorize(x_dims); - if (x_dim_vec.size() > repeat_times.size()) { - auto diff = x_dim_vec.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, -1); - } else { - auto diff = repeat_times.size() - x_dim_vec.size(); - x_dim_vec.insert(x_dim_vec.begin(), diff, -1); - } - for (size_t i = 0; i < repeat_times.size(); ++i) { - if (x_dim_vec[i] == -1 || repeat_times[i] == -1) { - out_shape[i] = -1; - } else { - PADDLE_ENFORCE_GT( - repeat_times[i], 0, - platform::errors::InvalidArgument( - "Every element of the input 'repeat_times' for tile op must be " - "greater than 0, but the value given is %d.", - repeat_times[i])); - out_shape[i] = x_dim_vec[i] * repeat_times[i]; - } - } - - ctx->SetOutputDim("Out", phi::make_ddim(out_shape)); - if (out_shape[0] == x_dims[0]) { - ctx->ShareLoD("X", "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -268,38 +212,15 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(TileGradNoNeedBufVarsInferer, "X"); } // namespace paddle namespace ops = paddle::operators; + +DECLARE_INFER_SHAPE_FUNCTOR(tile, TileInferMetaFunctor, + PD_INFER_META(phi::TileInferMeta)); + REGISTER_OPERATOR(tile, ops::TileOp, ops::TileOpMaker, ops::TileGradOpMaker, - ops::TileGradOpMaker); + ops::TileGradOpMaker, + TileInferMetaFunctor); REGISTER_OPERATOR(tile_grad, ops::TileGradOp, ops::TileDoubleGradOpMaker, ops::TileDoubleGradOpMaker, ops::TileGradNoNeedBufVarsInferer); -REGISTER_OP_CPU_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); -REGISTER_OP_CPU_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - 
ops::TileGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -REGISTER_OP_CUDA_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); -REGISTER_OP_CUDA_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel); -#endif diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h deleted file mode 100644 index 1698b5e3c6322e2cd9cbe7cf4839e2fc08627b32..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/tile_op.h +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -#define MAX_RANK_SUPPORTED 6 - -namespace paddle { -namespace operators { -inline std::vector get_repeat_times( - const framework::ExecutionContext& ctx) { - if (ctx.HasInput("RepeatTimes")) { - auto* repeat_tensor = ctx.Input("RepeatTimes"); - auto* repeat_data = repeat_tensor->data(); - framework::Tensor cpu_repeat_tensor; - if (platform::is_gpu_place(repeat_tensor->place()) || - platform::is_xpu_place(repeat_tensor->place()) || - platform::is_npu_place(repeat_tensor->place())) { - paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(), - &cpu_repeat_tensor); - repeat_data = cpu_repeat_tensor.data(); - } - auto vec_repeat_times = - std::vector(repeat_data, repeat_data + repeat_tensor->numel()); - return vec_repeat_times; - } - - auto list_repeat_times_tensor = - ctx.MultiInput("repeat_times_tensor"); - if (list_repeat_times_tensor.size() > 0) { - // get tensor from - std::vector vec_repeat_times; - for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { - auto tensor = list_repeat_times_tensor[i]; - if (platform::is_gpu_place(tensor->place()) || - platform::is_xpu_place(tensor->place()) || - platform::is_npu_place(tensor->place())) { - framework::Tensor temp; - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp); - vec_repeat_times.push_back(*temp.data()); - } else { - vec_repeat_times.push_back(*tensor->data()); - } - } - return vec_repeat_times; - } else { - return ctx.Attr>("repeat_times"); - } -} - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; -template -using EigenTensor = framework::EigenTensor; -using framework::To32BitIndex; - -template -class TileKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto rank = context.Input("X")->dims().size(); - PADDLE_ENFORCE_GE( - rank, 1, platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op must be a positive " - "integer, but the value received is %d.", - rank)); - PADDLE_ENFORCE_LE( - rank, 
MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'x' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, rank)); - auto repeat_times = get_repeat_times(context); - int repeat_times_size = repeat_times.size(); - PADDLE_ENFORCE_GE( - repeat_times_size, 1, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile " - "op must be positive, but the value received is %d.", - repeat_times_size)); - PADDLE_ENFORCE_LE( - repeat_times_size, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The number of elements of the input 'repeat_times' for tile op " - "must be less than or equal to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, repeat_times_size)); - rank = std::max(rank, repeat_times_size); - switch (rank) { - case 1: - Tile<1>(context); - break; - case 2: - Tile<2>(context); - break; - case 3: - Tile<3>(context); - break; - case 4: - Tile<4>(context); - break; - case 5: - Tile<5>(context); - break; - case 6: - Tile<6>(context); - break; - } - } - - protected: - template - void Tile(const framework::ExecutionContext& context) const { - auto* in0 = context.Input("X"); - - auto in_dims = in0->dims(); - auto repeat_times = get_repeat_times(context); - for (size_t i = 0; i < repeat_times.size(); ++i) { - PADDLE_ENFORCE_GT( - repeat_times[i], 0, - platform::errors::InvalidArgument( - "All elements of the input 'repeat_times' for tile op must " - "be positive integers, but the value received is %d.", - repeat_times[i])); - } - auto vec_in_dims = phi::vectorize(in_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - PADDLE_ENFORCE_EQ( - repeat_times.size(), vec_in_dims.size(), - platform::errors::InvalidArgument( - "The rank (%d) of the input 'x' and the rank (%d) of the input " - "'repeat_times' for tile op must match after promotion.", - vec_in_dims.size(), repeat_times.size())); - auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; - for (size_t i = 0; i < repeat_times.size(); ++i) { - bcast_dims[i] = repeat_times[i]; - } - - framework::DDim new_in_dims = phi::make_ddim(vec_in_dims); - framework::DDim out_dims(new_in_dims); - for (size_t i = 0; i < repeat_times.size(); ++i) { - out_dims[i] *= repeat_times[i]; - } - - out0->Resize(out_dims); - auto x = EigenTensor::From(*in0, new_in_dims); - out0->mutable_data(context.GetPlace()); - auto y = EigenTensor::From(*out0, out_dims); - auto& place = - *context.template device_context().eigen_device(); - // use 32-bit index to speed up - bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); - if (use_32bit_index) { - EigenBroadcast, T, Rank>::Eval( - place, To32BitIndex(y), To32BitIndex(x), bcast_dims); - } else { - EigenBroadcast, T, Rank>::Eval(place, y, x, - bcast_dims); - } - } -}; - -template -class TileGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto repeat_times = get_repeat_times(context); - auto x_dims = x->dims(); - auto vec_in_dims = phi::vectorize(x_dims); - if (repeat_times.size() < vec_in_dims.size()) { - int diff = vec_in_dims.size() - repeat_times.size(); - repeat_times.insert(repeat_times.begin(), diff, 1); - } else { - int 
diff = repeat_times.size() - vec_in_dims.size(); - vec_in_dims.insert(vec_in_dims.begin(), diff, 1); - } - // 1. reshape_dims_vec is the broadcast parameter. - // 2. reduce_dims_vec is the dimension parameter to compute gradients. For - // each dimension expanded, the gradients should be summed to original - // size. - std::vector reshape_dims_vec; - std::vector reduce_dims_vec; - for (size_t i = 0; i < repeat_times.size(); ++i) { - reduce_dims_vec.push_back(reshape_dims_vec.size()); - reshape_dims_vec.push_back(repeat_times[i]); - reshape_dims_vec.push_back(vec_in_dims[i]); - } - - int dims = reduce_dims_vec.size(); - - bool just_copy = true; - for (size_t i = 0; i < repeat_times.size(); i++) { - if (repeat_times[i] != 1) { - just_copy = false; - break; - } - } - // no need reduce, just copy - if (just_copy) { - auto* dout = context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - dx->mutable_data(context.GetPlace()); - framework::TensorCopy(*dout, context.GetPlace(), context.device_context(), - dx); - // TensorCopy may change the dims of dx - dx->Resize(x_dims); - } else { - PADDLE_ENFORCE_GE(dims, 1, - platform::errors::InvalidArgument( - "Th rank of the input 'Out@GRAD' for tile_grad op " - " must be greater than or equal to 1, but " - "the value received is %d.", - dims)); - PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, - platform::errors::InvalidArgument( - "The rank of the input 'Out@GRAD' for tile_grad op " - "must be less than or equal " - "to %d, but the value received is %d.", - MAX_RANK_SUPPORTED, dims)); - switch (dims) { - case 1: - TileBackward<1>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 2: - TileBackward<2>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 3: - TileBackward<3>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 4: - TileBackward<4>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 5: - TileBackward<5>(context, reshape_dims_vec, reduce_dims_vec); - break; - case 6: - TileBackward<6>(context, reshape_dims_vec, reduce_dims_vec); - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support tensor with rank being between 1 and 6. 
But " - "received tensor's rank = %d.", - dims)); - } - } - } - - protected: - template - void TileBackward(const framework::ExecutionContext& context, - const std::vector& reshape_dims_vec, - const std::vector& reduce_dims_vec) const { - size_t reshape_size = reshape_dims_vec.size(); - size_t reduce_size = reduce_dims_vec.size(); - auto* in0 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; - for (size_t i = 0; i < reshape_size; ++i) { - reshape_dims[i] = reshape_dims_vec[i]; - } - Eigen::DSizes reduce_dims; - for (size_t i = 0; i < reduce_size; ++i) { - reduce_dims[i] = reduce_dims_vec[i]; - } - - auto out_grad = EigenVector::Flatten(*in0); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcastGrad, T, Dims>::Eval( - place, x_grad, out_grad, reduce_dims, reshape_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tile_op_functor.h b/paddle/fluid/operators/tile_op_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..95bfb9f4e1a9d374c66997567f5d80df8b5d8701 --- /dev/null +++ b/paddle/fluid/operators/tile_op_functor.h @@ -0,0 +1,67 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include <vector>
+
+#include "paddle/fluid/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+namespace paddle {
+namespace operators {
+
+inline std::vector<int> get_repeat_times(
+    const framework::ExecutionContext& ctx) {
+  if (ctx.HasInput("RepeatTimes")) {
+    auto* repeat_tensor = ctx.Input<framework::Tensor>("RepeatTimes");
+    auto* repeat_data = repeat_tensor->data<int>();
+    framework::Tensor cpu_repeat_tensor;
+    if (platform::is_gpu_place(repeat_tensor->place()) ||
+        platform::is_xpu_place(repeat_tensor->place()) ||
+        platform::is_npu_place(repeat_tensor->place())) {
+      paddle::framework::TensorCopySync(*repeat_tensor, platform::CPUPlace(),
+                                        &cpu_repeat_tensor);
+      repeat_data = cpu_repeat_tensor.data<int>();
+    }
+    auto vec_repeat_times =
+        std::vector<int>(repeat_data, repeat_data + repeat_tensor->numel());
+    return vec_repeat_times;
+  }
+
+  auto list_repeat_times_tensor =
+      ctx.MultiInput<framework::Tensor>("repeat_times_tensor");
+  if (list_repeat_times_tensor.size() > 0) {
+    // get tensor from
+    std::vector<int> vec_repeat_times;
+    for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) {
+      auto tensor = list_repeat_times_tensor[i];
+      if (platform::is_gpu_place(tensor->place()) ||
+          platform::is_xpu_place(tensor->place()) ||
+          platform::is_npu_place(tensor->place())) {
+        framework::Tensor temp;
+        paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), &temp);
+        vec_repeat_times.push_back(*temp.data<int>());
+      } else {
+        vec_repeat_times.push_back(*tensor->data<int>());
+      }
+    }
+    return vec_repeat_times;
+  } else {
+    return ctx.Attr<std::vector<int>>("repeat_times");
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc
index 9e306c7be537bc7403812f4907541e1a9671c12a..cea6b458aec782923722cb37fe41c1c4d59292e5 100644
--- a/paddle/fluid/operators/tile_op_npu.cc
+++ b/paddle/fluid/operators/tile_op_npu.cc
@@ -11,7 +11,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/tile_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/tile_op_functor.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/tile_op_xpu.cc b/paddle/fluid/operators/tile_op_xpu.cc
index 6b60b167a2465fcb03d8ec088cfa288f9fb14af1..598377587d6f73e0c21abbc4d3819d16eacb1f23 100644
--- a/paddle/fluid/operators/tile_op_xpu.cc
+++ b/paddle/fluid/operators/tile_op_xpu.cc
@@ -11,11 +11,14 @@ limitations under the License.
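[Editor's aside] The new tile_op_functor.h resolves the repeat counts from three sources in priority order: a runtime RepeatTimes tensor, then the repeat_times_tensor list (one scalar tensor per element), then the static repeat_times attribute. A behavioral sketch of that precedence (our names, not Paddle code):

```cpp
#include <optional>
#include <vector>

std::vector<int> resolve_repeat_times(
    const std::optional<std::vector<int>>& runtime_tensor,  // "RepeatTimes"
    const std::vector<std::vector<int>>& tensor_list,       // "repeat_times_tensor"
    const std::vector<int>& attr) {                         // "repeat_times" attr
  if (runtime_tensor) return *runtime_tensor;  // highest priority
  if (!tensor_list.empty()) {
    std::vector<int> out;
    // Mirrors *tensor->data<int>(): one scalar is read from each list entry.
    for (const auto& t : tensor_list) out.push_back(t.front());
    return out;
  }
  return attr;  // static fallback
}
```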
*/ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/tile_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/tile_op_functor.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; + template class TileXPUKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index d60976928e00cb5ecfde6ca65e0a1b0d5b1ef938..80c9935057cb5d5809fde545bdd0772afdaf2702 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -51,6 +51,19 @@ namespace operators { using Tensor = framework::Tensor; +inline void GetDims(const phi::DDim& dim, int axis, int* pre, int* n, + int* post) { + *pre = 1; + *post = 1; + *n = dim[axis]; + for (int i = 0; i < axis; ++i) { + (*pre) *= dim[i]; + } + for (int i = axis + 1; i < dim.size(); ++i) { + (*post) *= dim[i]; + } +} + struct SegmentOffsetIter { EIGEN_DEVICE_FUNC explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} diff --git a/paddle/fluid/operators/top_k_v2_op.cc b/paddle/fluid/operators/top_k_v2_op.cc index 810afc901df57bfa3c518b2363fb9153ee353762..d1add111e1d24cb711955a9aff06eb19feb35dc9 100644 --- a/paddle/fluid/operators/top_k_v2_op.cc +++ b/paddle/fluid/operators/top_k_v2_op.cc @@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/top_k_v2_op.h" #include +#include "paddle/fluid/framework/op_registry.h" + namespace paddle { namespace operators { @@ -173,15 +174,3 @@ REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, ops::TopkV2GradOpMaker); REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); - -REGISTER_OP_CPU_KERNEL(top_k_v2, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel, - ops::TopkV2Kernel) - -REGISTER_OP_CPU_KERNEL( - top_k_v2_grad, ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel, - ops::TopkV2GradKernel) diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu deleted file mode 100644 index 84d8eef53bf72c5dbd5404a889925541374c9823..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/top_k_v2_op.cu +++ /dev/null @@ -1,296 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/top_k_function_cuda.h" -#include "paddle/fluid/operators/top_k_v2_op.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -#define FIXED_BLOCK_DIM_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kBlockDim = (dim); \ - __VA_ARGS__; \ - } break - -#define FIXED_BLOCK_DIM(...) 
\ - FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ - FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) - -template -class TopkV2OpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); - - // get the attributes - int k = static_cast(ctx.Attr("k")); - int axis = static_cast(ctx.Attr("axis")); - const bool& sorted = static_cast(ctx.Attr("sorted")); - const bool& largest = static_cast(ctx.Attr("largest")); - - // get the input dims - const auto& in_dims = input->dims(); - // calcluate the real axis - if (axis < 0) axis += in_dims.size(); - - auto* k_t = ctx.Input("K"); - if (k_t) { - Tensor k_host; - framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); - k = k_host.data()[0]; - framework::DDim output_dims = output->dims(); - output_dims[axis] = k; - output->Resize(output_dims); - indices->Resize(output_dims); - } - - const auto& out_dims = output->dims(); - - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); - - if (axis == in_dims.size() - 1) { - // if get the topK from the last axis - const int64_t& input_height = - phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t& input_width = in_dims[in_dims.size() - 1]; - const auto& dev_ctx = ctx.cuda_device_context(); - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, input, input_width, input_height, k, output, - indices, largest)) { - // Successed, return. - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - // NOTE: pass lds and dim same to input width. - // NOTE: old matrix implementation of stride is different to eigen. - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - output_data, k, indices_data, input_data, input_width, - input_width, static_cast(k), gridx, input_height, - largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - } else { - // if get topK not from the last axis, will tranpose the tensor and get - // TopK - - // first step, prepare the trans args for the tranpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.emplace_back(i); - } - trans.emplace_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.emplace_back(i); - } - trans.emplace_back(axis); - - framework::DDim trans_dims(in_dims); - framework::DDim trans_out_dims(output->dims()); - for (int i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - trans_out_dims[i] = out_dims[trans[i]]; - } - // second step, tranpose the input - Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - int ndims = trans.size(); - const auto& dev_ctx = ctx.cuda_device_context(); - TransCompute(ndims, dev_ctx, *input, - &trans_input, trans); - // third step, calcluate the topk - // allocate the tmp cuda memory for the tmp result - Tensor trans_ind; - trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); - Tensor trans_out; - trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); - - const int64_t input_height = - phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t input_width = trans_dims[trans_dims.size() - 1]; - - if (k > input_width) k = input_width; - - // The conclusion is drawn from the data through multiple sets of - // statistics - if (input_width >= 128 && k >= input_width * 0.75) { - if (SortTopk(dev_ctx, &trans_input, input_width, input_height, k, - &trans_out, &trans_ind, largest)) { - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute( - ndims, dev_ctx, trans_out, output, trans); - return; - } else { - LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " - "default topk kernel."; - } - } - - const int kMaxHeight = 2048; - int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; - switch (GetDesiredBlockDim(input_width)) { -#ifdef PADDLE_WITH_HIP - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#else - FIXED_BLOCK_DIM( - KeMatrixTopK<<>>( - trans_out.data(), k, trans_ind.data(), - trans_input.data(), input_width, input_width, - static_cast(k), gridx, input_height, largest)); -#endif - default: - PADDLE_THROW(platform::errors::Fatal( - "the input data shape has error in the topk cuda kernel.")); - } - - // last step, tranpose back the indices and output - TransCompute( - ndims, dev_ctx, trans_ind, indices, trans); - TransCompute(ndims, dev_ctx, trans_out, - output, trans); - } - } -}; - -#undef FIXED_BLOCK_DIM_BASE -#undef FIXED_BLOCK_DIM -template -class TopkV2OpGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(context.GetPlace()), true, - platform::errors::InvalidArgument( - "It must use CUDAPlace, you must check your device set.")); - auto* x = context.Input("X"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* indices = context.Input("Indices"); - auto* x_grad = context.Output(framework::GradVarName("X")); - int axis = context.Attr("axis"); - - const auto& in_dims = x->dims(); - const auto& out_dims = indices->dims(); - - // get the real the axis and the k - if (axis < 0) axis += in_dims.size(); - const int& k = out_dims[axis]; - const int& raw_height = in_dims[axis]; - - // allocate the cuda memory for the x_grad - T* x_grad_data = x_grad->mutable_data(context.GetPlace()); - const T* out_grad_data = out_grad->data(); - const int64_t* indices_data = indices->data(); - - int pre, n, post; - GetDims(in_dims, axis, &pre, &n, &post); - - // calcluate the block and grid num - auto& dev_ctx = context.cuda_device_context(); - auto ComputeBlockSize = [](int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - int block_size = ComputeBlockSize(post * k); - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); - int grid_size = std::min(max_blocks, pre); - - // lanuch the cuda kernel to assign the grad - AssignGradWithAxis<<>>( - out_grad_data, indices_data, x_grad_data, pre, post, n, k); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - top_k_v2, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel, - paddle::operators::TopkV2OpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, float>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, double>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, int64_t>, - paddle::operators::TopkV2OpGradCUDAKernel< - paddle::platform::CUDADeviceContext, paddle::platform::float16>); diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h deleted file mode 100644 index 
diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h
deleted file mode 100644
index a808207476f3b9be2636741d7b0ac06002ccba08..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/top_k_v2_op.h
+++ /dev/null
@@ -1,335 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
-  The topk v2 exists for compatibility: NaN is redefined as the maximum
-  value during comparison. Without topk v2, inference results of models
-  trained by older versions of PaddlePaddle would be affected.
-*/
-
-#pragma once
-#include <algorithm>
-#include <iostream>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/top_k_op.h"
-#include "paddle/fluid/operators/transpose_op.h"
-
-namespace paddle {
-namespace operators {
-
-inline void GetDims(const framework::DDim& dim, int axis, int* pre, int* n,
-                    int* post) {
-  *pre = 1;
-  *post = 1;
-  *n = dim[axis];
-  for (int i = 0; i < axis; ++i) {
-    (*pre) *= dim[i];
-  }
-  for (int i = axis + 1; i < dim.size(); ++i) {
-    (*post) *= dim[i];
-  }
-}
-
-template <typename T, typename Type>
-static void FullTopK(Type input_height, Type input_width, int input_dim,
-                     const framework::Tensor* input, T* t_out, Type* t_indices,
-                     const int& k, const bool& largest, const bool& sorted) {
-  // when k is small relative to the row width, use a partial sort
-  bool partial_sort_flag = (k * 64) < input_width;
-
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  // Eigen::DSizes<int, 2> flat2dims(input_height, input_width);
-  for (Type i = 0; i < input_height; ++i) {
-    std::vector<std::pair<T, Type>> col_vec;
-    col_vec.reserve(input_width);
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(j), j));
-      }
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      for (Type j = 0; j < input_width; ++j) {
-        col_vec.emplace_back(std::pair<T, Type>(e_input(i, j), j));
-      }
-    }
-    if (partial_sort_flag) {
-      std::partial_sort(
-          col_vec.begin(), col_vec.begin() + k, col_vec.end(),
-          [&largest](const std::pair<T, Type>& l,
-                     const std::pair<T, Type>& r) {
-            if (largest) {
-              return (std::isnan(static_cast<double>(l.first)) &&
-                      !std::isnan(static_cast<double>(r.first))) ||
-                     (l.first > r.first);
-            } else {
-              return (!std::isnan(static_cast<double>(l.first)) &&
-                      std::isnan(static_cast<double>(r.first))) ||
-                     (l.first < r.first);
-            }
-          });
-    } else {
-      // use nth_element to get the k-th largest or k-th smallest element
-      if (largest) {
-        std::nth_element(
-            col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
-            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-              return (std::isnan(static_cast<double>(l.first)) &&
-                      !std::isnan(static_cast<double>(r.first))) ||
-                     (l.first > r.first);
-            });
-        // nth_element leaves the front elements unordered; sort them
-        if (sorted) {
-          std::sort(col_vec.begin(), col_vec.begin() + k - 1,
-                    [&largest](const std::pair<T, Type>& l,
-                               const std::pair<T, Type>& r) {
-                      return (std::isnan(static_cast<double>(l.first)) &&
-                              !std::isnan(static_cast<double>(r.first))) ||
-                             (l.first > r.first);
-                    });
-        }
-      } else {
-        std::nth_element(
-            col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(),
-            [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-              return (!std::isnan(static_cast<double>(l.first)) &&
-                      std::isnan(static_cast<double>(r.first))) ||
-                     (l.first < r.first);
-            });
-        // nth_element leaves the front elements unordered; sort them
-        if (sorted) {
-          std::sort(
-              col_vec.begin(), col_vec.begin() + k - 1,
-              [](const std::pair<T, Type>& l, const std::pair<T, Type>& r) {
-                return (!std::isnan(static_cast<double>(l.first)) &&
-                        std::isnan(static_cast<double>(r.first))) ||
-                       (l.first < r.first);
-              });
-        }
-      }
-    }
-    for (Type j = 0; j < k; ++j) {
-      t_out[i * k + j] = col_vec[j].first;
-      t_indices[i * k + j] = col_vec[j].second;
-    }
-  }
-}
-
-template <typename T, typename Type>
-static void FullTopKAssign(const Type& input_height, const Type& input_width,
-                           const int& input_dim, const framework::Tensor* input,
-                           const framework::Tensor* indices, T* output_data,
-                           const int& k) {
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for
-#endif
-  for (Type i = 0; i < input_height; ++i) {
-    if (input_dim == 1) {
-      auto e_input = framework::EigenVector<T>::Flatten(*input);
-      auto e_indices = framework::EigenVector<Type>::Flatten(*indices);
-      for (Type j = 0; j < k; ++j) {
-        output_data[i * input_width + e_indices(j)] = e_input(j);
-      }
-    } else {
-      auto e_input = framework::EigenMatrix<T>::Reshape(*input, input_dim - 1);
-      auto e_indices =
-          framework::EigenMatrix<Type>::Reshape(*indices, input_dim - 1);
-      for (Type j = 0; j < k; ++j) {
-        output_data[i * input_width + e_indices(i, j)] = e_input(i, j);
-      }
-    }
-  }
-}
-
-template <typename DeviceContext, typename T>
-class TopkV2Kernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    // Get the top k elements of each row of input tensor
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    auto* indices = context.Output<Tensor>("Indices");
-    const auto& in_dims = input->dims();
-    int k = static_cast<int>(context.Attr<int>("k"));
-    const auto& sorted = static_cast<bool>(context.Attr<bool>("sorted"));
-    const auto& largest = static_cast<bool>(context.Attr<bool>("largest"));
-
-    // axis < 0, calculate the real axis
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-    if (axis < 0) axis += in_dims.size();
-
-    // if the K tensor is not null, use it as k
-    auto* k_t = context.Input<Tensor>("K");
-    if (k_t) {
-      k = k_t->data<int>()[0];
-      framework::DDim output_dims = output->dims();
-      // according to axis, set the K value in the dim
-      output_dims[axis] = k;
-      output->Resize(output_dims);
-      indices->Resize(output_dims);
-    }
-
-    T* output_data = output->mutable_data<T>(context.GetPlace());
-    int64_t* indices_data = indices->mutable_data<int64_t>(context.GetPlace());
-    const auto& out_dims = output->dims();
-    if (axis + 1 == in_dims.size()) {
-      const int64_t& input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t& input_width = in_dims[in_dims.size() - 1];
-      FullTopK(input_height, input_width, in_dims.size(), input,
-               output_data, indices_data, k, largest, sorted);
-    } else {
-      // if the topk dim is not the last dim, transpose and then do topk
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.push_back(in_dims.size() - 1);
-      for (int i = axis + 1; i < in_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-
-      // get the trans input_dims, out_dims
-      framework::DDim trans_dims(in_dims);
-      framework::DDim trans_out_dims(output->dims());
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = in_dims[trans[i]];
-      }
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_out_dims[i] = out_dims[trans[i]];
-      }
-
-      Tensor trans_inp;
-      trans_inp.mutable_data<T>(trans_dims, context.GetPlace());
-      int ndims = trans.size();
-      auto& dev_context =
-          context.template device_context<DeviceContext>();
-
-      // transpose the input value
-      TransCompute<DeviceContext, T>(ndims, dev_context, *input,
-                                     &trans_inp, trans);
-
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1));
-      const int64_t input_width = trans_dims[trans_dims.size() - 1];
-
-      // Allocate the temp tensors to save the topk indices and values
-      Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_out_dims, context.GetPlace());
-      Tensor tmp_indices;
-      auto* t_ind =
-          tmp_indices.mutable_data<int64_t>(trans_out_dims, context.GetPlace());
-
-      // get the TopK value
-      FullTopK(input_height, input_width, in_dims.size(),
-               &trans_inp, t_out, t_ind, k, largest, sorted);
-      // transpose back
-      TransCompute<DeviceContext, int64_t>(
-          ndims, dev_context, tmp_indices, indices, trans);
-      TransCompute<DeviceContext, T>(ndims, dev_context, tmp_out,
-                                     output, trans);
-    }
-  }
-};
-
-template <typename DeviceContext, typename T>
-class TopkV2GradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* indices = context.Input<Tensor>("Indices");
-    auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-    int axis = static_cast<int>(context.Attr<int>("axis"));
-
-    const auto& in_dims = x->dims();
-    const auto& out_dims = indices->dims();
-
-    // axis < 0, get the real axis
-    axis = (axis < 0) ? (in_dims.size() + axis) : axis;
-    const size_t& k = out_dims[axis];
-
-    T* x_grad_data = x_grad->mutable_data<T>(context.GetPlace());
-    if (axis + 1 == in_dims.size()) {
-      // allocate the memory for the input_grad
-
-      // assign the out_grad to input_grad directly
-      const int64_t input_height =
-          phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
-      const int64_t input_width = in_dims[in_dims.size() - 1];
-
-      // init the output grad with 0, because some input elements have no grad
-      memset(x_grad_data, 0, x_grad->numel() * sizeof(T));
-      // Assign the output_grad to input_grad
-      FullTopKAssign(input_height, input_width, in_dims.size(), out_grad,
-                     indices, x_grad_data, k);
-    } else {
-      // cannot assign grad to input_grad directly, must do the transpose
-      std::vector<int> trans;
-      for (int i = 0; i < axis; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(out_dims.size() - 1);
-      for (int i = axis + 1; i < out_dims.size() - 1; i++) {
-        trans.emplace_back(i);
-      }
-      trans.emplace_back(axis);
-      framework::DDim trans_dims(out_dims);
-      framework::DDim trans_in_dims(in_dims);
-      for (size_t i = 0; i < trans.size(); i++) {
-        trans_dims[i] = out_dims[trans[i]];
-        trans_in_dims[i] = in_dims[trans[i]];
-      }
-      // transpose the out_grad, indices
-      Tensor trans_dO;
-      trans_dO.mutable_data<T>(trans_dims, context.GetPlace());
-      Tensor trans_ind;
-      trans_ind.mutable_data<int64_t>(trans_dims, context.GetPlace());
-      int ndims = trans.size();
-      auto& dev_context =
-          context.template device_context<DeviceContext>();
-
-      // Do transpose
-      TransCompute<DeviceContext, T>(ndims, dev_context, *out_grad,
-                                     &trans_dO, trans);
-      TransCompute<DeviceContext, int64_t>(
-          ndims, dev_context, *indices, &trans_ind, trans);
-      const int64_t input_height = phi::product(
-          phi::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1));
-      const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1];
-
-      // Assign the out_grad to the transposed input_grad
-      Tensor tmp_out;
-      T* t_out = tmp_out.mutable_data<T>(trans_in_dims, context.GetPlace());
-      memset(t_out, 0, x_grad->numel() * sizeof(T));
-
-      FullTopKAssign(input_height, input_width, in_dims.size(),
-                     &trans_dO, &trans_ind, t_out, k);
-
-      // Transpose back
-      TransCompute<DeviceContext, T>(ndims, dev_context, tmp_out,
-                                     x_grad, trans);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
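The deleted top_k_v2_op.h above selects between std::partial_sort and std::nth_element per row, using a comparator that treats NaN as the largest value (the compatibility note at the top of the file). A standalone sketch of that selection logic on a single row, assuming float data:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

// NaN-aware "greater": NaN sorts as the largest value, matching the
// comparator in the deleted FullTopK.
bool GreaterNanAware(const std::pair<float, int>& l,
                     const std::pair<float, int>& r) {
  return (std::isnan(static_cast<double>(l.first)) &&
          !std::isnan(static_cast<double>(r.first))) ||
         (l.first > r.first);
}

int main() {
  std::vector<std::pair<float, int>> row = {
      {3.f, 0}, {NAN, 1}, {1.f, 2}, {7.f, 3}, {5.f, 4}};
  const int k = 2;
  // Same selection rule as FullTopK: partial_sort when k is small
  // relative to the row width, otherwise nth_element plus a sort of the
  // front block when sorted output is requested.
  if (k * 64 < static_cast<int>(row.size())) {
    std::partial_sort(row.begin(), row.begin() + k, row.end(),
                      GreaterNanAware);
  } else {
    std::nth_element(row.begin(), row.begin() + k - 1, row.end(),
                     GreaterNanAware);
    std::sort(row.begin(), row.begin() + k - 1, GreaterNanAware);
  }
  for (int j = 0; j < k; ++j)  // prints the NaN first, then 7
    std::printf("value=%f index=%d\n", row[j].first, row[j].second);
  return 0;
}
```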
diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc
index 5b8a6b3e75449508afa5d316d81f97ab815c9ea9..caaae02124c926b9e4be08744e4192dab20ca5d0 100644
--- a/paddle/fluid/operators/top_k_v2_op_mlu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/top_k_v2_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc
index e11070638834c46a6628d652216e1ddddeb2487d..dff5c2d3f39378486bb5d2f8010d005d57b20550 100644
--- a/paddle/fluid/operators/top_k_v2_op_npu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_npu.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/top_k_v2_op.h"
 #include <memory>
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/top_k_v2_op_xpu.cc b/paddle/fluid/operators/top_k_v2_op_xpu.cc
index 49daac2ff0da63c542a807dc97925c6989559f14..4d9c39be92eff029e66cdde900318b045c2b531f 100644
--- a/paddle/fluid/operators/top_k_v2_op_xpu.cc
+++ b/paddle/fluid/operators/top_k_v2_op_xpu.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <memory>
 
-#include "paddle/fluid/operators/top_k_op.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/transpose_op.h"
 #include "xpu/refactor/math.h"
diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc
index 5617d728a51dc1c5e21053a2af05d062ecc1a22b..fb39034c8e92c1ac39aa1ca6e57d5a08ca1ca9d6 100644
--- a/paddle/fluid/operators/transpose_op_npu_test.cc
+++ b/paddle/fluid/operators/transpose_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/triangular_solve_op.h b/paddle/fluid/operators/triangular_solve_op.h
index 315847b4d800e46aea6c927f9b7055261b56e9bc..fd46aca456cd9bd883cf9d1ce3576b307794b1a5 100644
--- a/paddle/fluid/operators/triangular_solve_op.h
+++ b/paddle/fluid/operators/triangular_solve_op.h
@@ -60,45 +60,5 @@ static void triangular_solve(const DeviceContext &context, const Tensor &x,
                    unitriangular);
 }
 
-template <typename DeviceContext, typename T>
-class MatrixReduceSumFunctor {
- public:
-  void operator()(const Tensor &input, Tensor *output,
-                  const framework::ExecutionContext &ctx);
-};
-
-template <typename T>
-class MatrixReduceSumFunctor<platform::CPUDeviceContext, T> {
- public:
-  void operator()(const Tensor &in, Tensor *out,
-                  const framework::ExecutionContext &ctx) {
-    // For example: in's dim = [5, 3, 2, 7, 3] ; out's dim = [3, 1, 7, 3]
-    // out_reduce_dim should be [0, 2]
-    const std::vector<std::int64_t> in_dims = phi::vectorize(in.dims());
-    auto in_size = in_dims.size();
-    const std::vector<std::int64_t> out_dims = phi::vectorize(out->dims());
-    auto out_size = out_dims.size();
-
-    std::vector<std::int64_t> out_bst_dims(in_size);
-
-    std::fill(out_bst_dims.data(), out_bst_dims.data() + in_size - out_size, 1);
-    std::copy(out_dims.data(), out_dims.data() + out_size,
-              out_bst_dims.data() + in_size - out_size);
-    out->Resize(phi::make_ddim(out_bst_dims));
-
-    std::vector<int> out_reduce_dims;
-    for (size_t idx = 0; idx <= in_size - 3; idx++) {
-      if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) {
-        out_reduce_dims.push_back(idx);
-      }
-    }
-
-    ReduceKernelFunctor<platform::CPUDeviceContext, T, SumFunctor>(
-        &in, out, out_reduce_dims, true, false, ctx)
-        .template apply<T>();
-    out->Resize(phi::make_ddim(out_dims));
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
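The removed MatrixReduceSumFunctor derives its reduce axes by left-padding the output shape to the input rank with 1s and then reducing every batch axis that was broadcast. A small self-contained sketch of that axis computation (the helper name is hypothetical):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Recomputes the reduce axes the way the deleted functor did: left-pad
// out's shape with 1s to in's rank, then reduce every batch axis where
// in is non-1 but the padded out is 1 (the last two axes are the matrix
// dims and are never reduced).
std::vector<int> ReduceDims(const std::vector<int>& in_dims,
                            const std::vector<int>& out_dims) {
  const size_t in_size = in_dims.size(), out_size = out_dims.size();
  std::vector<int> out_bst_dims(in_size, 1);
  std::copy(out_dims.begin(), out_dims.end(),
            out_bst_dims.begin() + (in_size - out_size));
  std::vector<int> reduce_dims;
  for (size_t idx = 0; idx + 2 < in_size; ++idx)  // batch dims only
    if (in_dims[idx] != 1 && out_bst_dims[idx] == 1) reduce_dims.push_back(idx);
  return reduce_dims;
}

int main() {
  // The example from the deleted comment: in [5,3,2,7,3], out [3,1,7,3]
  for (int d : ReduceDims({5, 3, 2, 7, 3}, {3, 1, 7, 3})) std::printf("%d ", d);
  std::printf("\n");  // prints: 0 2
  return 0;
}
```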
diff --git a/paddle/fluid/operators/tril_triu_op_xpu.cc b/paddle/fluid/operators/tril_triu_op_xpu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e36cbcf228cfbf30c8fcd5562ac40f38a5467cdb
--- /dev/null
+++ b/paddle/fluid/operators/tril_triu_op_xpu.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under
+the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/tril_triu_op.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class TrilTriuXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const auto* x = context.Input<Tensor>("X");
+    const auto* x_data = x->data<T>();
+    auto* out = context.Output<Tensor>("Out");
+    auto* out_data = out->mutable_data<T>(context.GetPlace());
+
+    const int diagonal = context.Attr<int>("diagonal");
+    const bool lower = context.Attr<bool>("lower");
+    auto xshape = phi::vectorize<int>(x->dims());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    int r = 0;
+    if (lower) {
+      r = xpu::tril(dev_ctx.x_context(), x_data, out_data, xshape, diagonal);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "tril_op");
+    } else {
+      r = xpu::triu(dev_ctx.x_context(), x_data, out_data, xshape, diagonal);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "triu_op");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    tril_triu, ops::TrilTriuXPUKernel<paddle::platform::XPUDeviceContext, int>,
+    ops::TrilTriuXPUKernel<paddle::platform::XPUDeviceContext, float>);
+#endif
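For reference, the new XPU kernel above delegates entirely to xpu::tril / xpu::triu, whose intended behavior matches the usual tril/triu definition with a diagonal offset. A plain CPU sketch of that behavior (a hypothetical helper, not the XPU API):

```cpp
#include <cstdio>
#include <vector>

// CPU reference for a [rows, cols] matrix: keep entries on/below (tril)
// or on/above (triu) the diagonal shifted by `diagonal`, zero the rest.
void TrilTriu(const std::vector<float>& x, int rows, int cols, int diagonal,
              bool lower, std::vector<float>* out) {
  out->assign(rows * cols, 0.f);
  for (int i = 0; i < rows; ++i)
    for (int j = 0; j < cols; ++j) {
      const bool keep = lower ? (j - i <= diagonal) : (j - i >= diagonal);
      if (keep) (*out)[i * cols + j] = x[i * cols + j];
    }
}

int main() {
  std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8, 9}, out;
  TrilTriu(x, 3, 3, 0, /*lower=*/true, &out);  // lower triangle of a 3x3
  for (int i = 0; i < 3; ++i)
    std::printf("%g %g %g\n", out[3 * i], out[3 * i + 1], out[3 * i + 2]);
  return 0;
}
```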
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc
index 6eb7f922dfdbec41aa1c47d11e1decc259d08689..dc5a66dce16d698f9cfac01e3bdc776d08c2af19 100644
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc
@@ -17,8 +17,10 @@ limitations under the License. */
 
 #include <random>
 
 #include "paddle/fluid/framework/generator.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/truncated_gaussian_random_op.h"
+#include "paddle/phi/infermeta/nullary.h"
 
 namespace paddle {
 namespace operators {
@@ -27,26 +29,6 @@ class TruncatedGaussianRandomOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"), true,
-        platform::errors::NotFound(
-            "Output(Out) of TruncatedGaussianRandomOp should not be null."));
-    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
-    std::vector<int64_t> out_dim;
-    out_dim.reserve(shape.size());
-    for (auto dim : shape) {
-      out_dim.push_back(static_cast<int64_t>(dim));
-    }
-    PADDLE_ENFORCE_GT(
-        shape.size(), 0UL,
-        platform::errors::InvalidArgument(
-            "the input shape of TruncatedGaussianRandomOp must be set, "
-            "but the rank of shape we received is %d",
-            shape.size()));
-    ctx->SetOutputDim("Out", phi::make_ddim(out_dim));
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -99,6 +81,14 @@ Used to initialize tensors with truncated gaussian random generator.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random,
-                             ops::TruncatedGaussianRandomOp,
-                             ops::TruncatedGaussianRandomOpMaker);
+
+DECLARE_INFER_SHAPE_FUNCTOR(
+    truncated_gaussian_random, TruncatedGaussianRandomInferShapeFunctor,
+    PD_INFER_META(phi::TruncatedGaussianRandomInferMeta));
+
+REGISTER_OPERATOR(
+    truncated_gaussian_random, ops::TruncatedGaussianRandomOp,
+    ops::TruncatedGaussianRandomOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
+    TruncatedGaussianRandomInferShapeFunctor);
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 5ab2004617810b34276632fa487e8f12d7b3b915..1be8f3387dbad85e0dce3593ad61b9c116b10ef0 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -236,7 +236,6 @@ register_unity_group(cc
   scatter_nd_add_op.cc
   scatter_op.cc
   seed_op.cc
-  segment_pool_op.cc
   select_input_op.cc
   select_output_op.cc)
 register_unity_group(cc
@@ -496,8 +495,7 @@ register_unity_group(cu
   scale_op.cu
   scatter_nd_add_op.cu
   scatter_op.cu
-  seed_op.cu
-  segment_pool_op.cu)
+  seed_op.cu)
 register_unity_group(cu
   roi_pool_op.cu
   selu_op.cu
diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
index 3e11c952d15f3460f987f6fa2cb28970f97cc96b..a8ced783744a961eb8ce64983de7e9615763c1b6 100644
--- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc
+++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/dropout_op.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
diff --git a/paddle/fluid/operators/viterbi_decode_op.cc b/paddle/fluid/operators/viterbi_decode_op.cc
index bf1cdeed65a8427c19410347209faa099673cb7c..602376d54e0d2a49b6cf4f6a78d332154c188a7e 100644
--- a/paddle/fluid/operators/viterbi_decode_op.cc
+++ b/paddle/fluid/operators/viterbi_decode_op.cc
@@ -9,8 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/viterbi_decode_op.h"
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/ternary.h"
 
 namespace paddle {
 namespace operators {
@@ -19,47 +21,6 @@ class ViterbiDecodeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
-  void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Transition"), "Input", "Transition",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores",
-                   "ViterbiDecode");
-    OP_INOUT_CHECK(ctx->HasOutput("Path"), "Output", "Path", "ViterbiDecode");
-    auto in_dims = ctx->GetInputDim("Input");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 3,
-                      platform::errors::InvalidArgument(
-                          "The rank of Input in ViterbiDecode must be 3. But "
-                          "received Input's rank is %d.",
-                          in_dims.size()));
-    auto length_dims = ctx->GetInputDim("Length");
-    PADDLE_ENFORCE_EQ(length_dims.size(), 1,
-                      platform::errors::InvalidArgument(
-                          "The rank of Length in ViterbiDecode must be 1. But "
-                          "received Length's rank is %d.",
-                          length_dims.size()));
-    auto transition_dims = ctx->GetInputDim("Transition");
-    PADDLE_ENFORCE_EQ(
-        transition_dims.size(), 2,
-        platform::errors::InvalidArgument(
-            "The rank of Transition in ViterbiDecode must be 2. But "
-            "received Transition's rank is %d.",
-            transition_dims.size()));
-    if (ctx->IsRuntime()) {
-      PADDLE_ENFORCE_EQ(
-          in_dims[0], length_dims[0],
-          platform::errors::InvalidArgument(
-              "The batch size of Input and Length should be equal."));
-      PADDLE_ENFORCE_EQ(in_dims[2], transition_dims[0],
-                        platform::errors::InvalidArgument(
-                            "The number of tags of Input (%d) and Transition "
-                            "(%d) should be equal.",
-                            transition_dims[0], in_dims[2]));
-    }
-    ctx->SetOutputDim("Scores", length_dims);
-  }
-
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
@@ -102,8 +63,8 @@ class ViterbiDecodeOpMaker : public framework::OpProtoAndCheckerMaker {
 
 namespace ops = paddle::operators;
 namespace platform = paddle::platform;
+DECLARE_INFER_SHAPE_FUNCTOR(viterbi_decode, ViterbiDecodeInferShapeFunctor,
+                            PD_INFER_META(phi::ViterbiDecodeInferMeta));
 REGISTER_OP_WITHOUT_GRADIENT(viterbi_decode, ops::ViterbiDecodeOp,
-                             ops::ViterbiDecodeOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    viterbi_decode, ops::ViterbiDecodeKernel<platform::CPUDeviceContext, float>,
-    ops::ViterbiDecodeKernel<platform::CPUDeviceContext, double>);
+                             ops::ViterbiDecodeOpMaker,
+                             ViterbiDecodeInferShapeFunctor);
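Both truncated_gaussian_random and viterbi_decode above follow the same migration pattern: the per-op InferShape override is deleted and a functor generated by DECLARE_INFER_SHAPE_FUNCTOR forwards shape inference to a phi InferMeta function. Schematically (placeholder op names, not real Paddle symbols):

```cpp
// Sketch of the migration pattern only; the argument lists of the real
// phi InferMeta functions differ per op.
//
// 1. Shape logic moves into phi as a standalone function:
//      void SomeOpInferMeta(const MetaTensor& x, /*...*/, MetaTensor* out);
//
// 2. The fluid op wires it in at registration time instead of overriding
//    InferShape:
//      DECLARE_INFER_SHAPE_FUNCTOR(some_op, SomeOpInferShapeFunctor,
//                                  PD_INFER_META(phi::SomeOpInferMeta));
//      REGISTER_OPERATOR(some_op, ops::SomeOp, ops::SomeOpMaker,
//                        SomeOpInferShapeFunctor);
//
// The same shape function then serves static graphs and the phi kernels,
// which is why the per-op InferShape bodies in this diff can be deleted.
```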
diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu
deleted file mode 100644
index 68628fb2748c424996e7f0ae24594ff04649f8d6..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/viterbi_decode_op.cu
+++ /dev/null
@@ -1,206 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/elementwise/elementwise_functor.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
-#include "paddle/fluid/operators/viterbi_decode_op.h"
-#include "paddle/phi/kernels/funcs/gather.cu.h"
-
-#ifdef __NVCC__
-#include "cub/cub.cuh"
-#endif
-#ifdef __HIPCC__
-#include <hipcub/hipcub.hpp>
-namespace cub = hipcub;
-#endif
-
-namespace paddle {
-namespace operators {
-
-#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...)  \
-  case (1 << (log2_block_dim)): {                       \
-    constexpr auto kBlockDim = (1 << (log2_block_dim)); \
-    __VA_ARGS__;                                        \
-  } break
-
-#define FIXED_BLOCK_DIM_CASE(...)               \
-  FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \
-  FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__);  \
-  FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
-
-int64_t ComputeBlockSize(int64_t col) {
-  if (col > 512)
-    return 1024;
-  else if (col > 256)
-    return 512;
-  else if (col > 128)
-    return 256;
-  else if (col > 64)
-    return 128;
-  else if (col > 32)
-    return 64;
-  else if (col > 16)
-    return 32;
-  else if (col > 8)
-    return 16;
-  else
-    return 8;
-}
-
-template