diff --git a/AUTHORS.md b/AUTHORS.md index 71d028fac369150ad7e8c0e78b5099b47abb56ee..1eaaff297714364d14a5463fb730d84761c8d18f 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -1,6 +1,9 @@ | Github account | name | |---|---| | abhinavarora | Abhinav Arora | +| andreazanetti | Andrea Zanetti | +| arlesniak | Artur Lesniak | +| arogowie-intel | Adam Osewski | | backyes | Yan-Fei Wang | | baiyfbupt | Yi-Fan Bai | | beckett1124 | Bin Qi | @@ -8,6 +11,7 @@ | chengxiaohua1105 | Xiao-Hua Cheng | | cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | | cxysteven | Xing-Yi Cheng | +| ddokupil | Dariusz Dokupil | | dzhwinter | Zhi-Hong Dong | | dragonwarrior | Long Wang | | dyning | Yuning Du | @@ -21,6 +25,7 @@ | hedaoyuan | Dao-Yuan He | | helinwang | He-Lin Wang | | jacquesqiao | Long-Fei Qiao | +| jakpiase | Jakub Piasecki | | [jczaja](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Jacek Czaja | | JiayiFeng | Jia-Yi Feng | | kbinias | Krzysztof Binias | @@ -42,6 +47,7 @@ | pakchoi | Chuan-Jiang Song | | panyx0718 | Xin Pan | | pengli09 | Peng Li | +| pmajchrzak |Piotr Majchrzak | | pkuyym | Ya-Ming Yang | | pzelazko-intel | Pawel Zelazko | | [pawelpiotrowicz](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Pawel Piotrowicz | @@ -72,3 +78,6 @@ | zhaopu7 | Pu Zhao | | zhouxiao-coder | Xiao Zhou | | Zrachel | Rui-Qing Zhang | +| jeng1220 | Bai-Cheng(Ryan) Jeng (NVIDIA) | +| mingxu1067 | Ming Huang (NVIDIA) | +| zlsh80826 | Reese Wang (NVIDIA) | diff --git a/CMakeLists.txt b/CMakeLists.txt index d874b21b0873da47d5acd5ef6a78bfe7fd7ce2e1..50070c7fc05133da758650eb5ac50e32effe63c9 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,14 +12,27 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.10) -cmake_policy(VERSION 3.10) +if(APPLE AND WITH_ARM) + # cmake 3.19.2 version starts to support M1 + cmake_minimum_required(VERSION 3.19.2) + cmake_policy(VERSION 3.19.2) +else(APPLE AND WITH_ARM) + cmake_minimum_required(VERSION 3.10) + cmake_policy(VERSION 3.10) +endif(APPLE AND WITH_ARM) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) include(system) +# Note(zhouwei): Ninja Generator will set CMAKE_BUILD_TYPE to Debug +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + project(paddle CXX C) # enable language CUDA @@ -66,6 +79,11 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() +if(APPLE AND WITH_ARM) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin") + set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin") +endif() + if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() @@ -90,10 +108,6 @@ if(WIN32) if (MSVC_STATIC_CRT) message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019") - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /MTd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /MT") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd") - 
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT") foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO @@ -105,9 +119,7 @@ if(WIN32) endforeach(flag_var) endif() - # NOTE(zhouwei25): temporarily change MP to 1 for reducing CPU & memory utilization - set(PROCESS_MAX 1) - #math(EXPR PROCESS_MAX "${CPU_CORES} * 1 / 2") + math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") # windows build turn off warnings, use parallel compiling. foreach(flag_var @@ -116,7 +128,10 @@ if(WIN32) CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") - set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") + # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling + if(NOT WITH_GPU) + set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") + endif() endforeach(flag_var) foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) set(${flag_var} "${${flag_var}} /w") @@ -208,16 +223,10 @@ option(WITH_STRIP "Strip so files of Whl packages" OFF) # PY_VERSION if(NOT PY_VERSION) - set(PY_VERSION 2.7) + set(PY_VERSION 3.6) endif() set(PYBIND11_PYTHON_VERSION ${PY_VERSION}) -# CMAKE_BUILD_TYPE -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release" CACHE STRING - "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" - FORCE) -endif() # the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined. Default: OFF if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$") @@ -282,6 +291,27 @@ if(WITH_GPU) endif() endif() +if(WITH_ROCM) + include(hip) + include(miopen) # set miopen libraries, must before configure +endif(WITH_ROCM) + +if (NOT WITH_ROCM AND WITH_RCCL) + MESSAGE(WARNING + "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") + set(WITH_RCCL OFF CACHE STRING + "Disable RCCL when compiling without ROCM" FORCE) +endif() + +if(WITH_RCCL) + add_definitions("-DPADDLE_WITH_RCCL") + include(rccl) +else() + if(WITH_ROCM) + MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.") + endif() +endif() + include(third_party) # download, build, install third_party, Contains about 20+ dependencies include(flags) # set paddle compile flags @@ -306,26 +336,6 @@ include(configure) # add paddle env configuration include_directories("${PADDLE_SOURCE_DIR}") -if(WITH_ROCM) - include(hip) -endif(WITH_ROCM) - -if (NOT WITH_ROCM AND WITH_RCCL) - MESSAGE(WARNING - "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") - set(WITH_RCCL OFF CACHE STRING - "Disable RCCL when compiling without ROCM" FORCE) -endif() - -if(WITH_RCCL) - add_definitions("-DPADDLE_WITH_RCCL") - include(rccl) -else() - if(WITH_ROCM) - MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.") - endif() -endif() - if(WITH_NV_JETSON) set(WITH_ARM ON CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE) endif() @@ -333,8 +343,9 @@ endif() if(WITH_ARM) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") - set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON" FORCE) + set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON." 
FORCE) set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE) + set(WITH_AVX OFF CACHE STRING "Disable AVX when compiling WITH_ARM=ON." FORCE) add_definitions(-DPADDLE_WITH_ARM) endif() @@ -352,6 +363,11 @@ if (WITH_MIPS) add_definitions(-DPADDLE_WITH_MIPS) endif() +if (WITH_HETERPS) + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") + endif() +endif() set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") diff --git a/README.md b/README.md index e8a7013d0b4432bc871843b83cf19494ca870cbc..6b3f3ef86fe1bc38483789d85b101143fc723ded 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ -

@@ -22,7 +21,7 @@ PaddlePaddle is originated from industrial practices with dedication and commitm ## Installation -### Latest PaddlePaddle Release: [v2.0](https://github.com/PaddlePaddle/Paddle/tree/release/2.0) +### Latest PaddlePaddle Release: [v2.1](https://github.com/PaddlePaddle/Paddle/tree/release/2.1) Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle. @@ -34,9 +33,9 @@ pip install paddlepaddle pip install paddlepaddle-gpu ``` -More infomation about installation, please view [Quick Install](https://www.paddlepaddle.org.cn/install/quick) +For more information about installation, please view [Quick Install](https://www.paddlepaddle.org.cn/install/quick) -Now our developers can acquire Tesla V100 online computing resources for free. If you create a program by AI Studio, you will obtain 10 hours to train models online per day. [Click here to start](https://aistudio.baidu.com/aistudio/index). +Now our developers can acquire Tesla V100 online computing resources for free. If you create a program by AI Studio, you will obtain 8 hours to train models online per day. [Click here to start](https://aistudio.baidu.com/aistudio/index). ## FOUR LEADING TECHNOLOGIES @@ -47,14 +46,13 @@ Now our developers can acquire Tesla V100 online computing resources for free. I - **Support Ultra-Large-Scale Training of Deep Neural Networks** - PaddlePaddle has made breakthroughs in ultra-large-scale deep neural networks training. It launched the world's first large-scale open-source training platform that supports the training of deep networks with 100 billions of features and trillions of parameters using data sources distributed over hundreds of nodes. PaddlePaddle overcomes the online deep learning challenges for ultra-large-scale deep learning models, and further achieved the real-time model updating with more than 1 trillion parameters. + PaddlePaddle has made breakthroughs in ultra-large-scale deep neural networks training. It launched the world's first large-scale open-source training platform that supports the training of deep networks with 100 billion features and trillions of parameters using data sources distributed over hundreds of nodes. PaddlePaddle overcomes the online deep learning challenges for ultra-large-scale deep learning models, and further achieved real-time model updating with more than 1 trillion parameters. [Click here to learn more](https://github.com/PaddlePaddle/Fleet) -- **Accelerated High-Performance Inference over Ubiquitous Deployments** +- **High-Performance Inference Engines for Comprehensive Deployment Environments** - PaddlePaddle is not only compatible with other open-source frameworks for models training, but also works well on the ubiquitous developments, varying from platforms to devices. More specifically, PaddlePaddle accelerates the inference procedure with the fastest speed-up. Note that, a recent breakthrough of inference speed has been made by PaddlePaddle on Huawei's Kirin NPU, through the hardware/software co-optimization. - [Click here to learn more](https://github.com/PaddlePaddle/Paddle-Lite) + PaddlePaddle is not only compatible with models trained in 3rd party open-source frameworks, but also offers complete inference products for various production scenarios.
Our inference product line includes [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html): Native inference library for high-performance server and cloud inference; [Paddle Serving](https://github.com/PaddlePaddle/Serving): A service-oriented framework suitable for distributed and pipeline productions; [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite): Ultra-Lightweight inference engine for mobile and IoT environments; [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs): A frontend inference engine for browsers and mini-apps. Furthermore, through extensive optimization for the leading hardware in each scenario, Paddle inference engines outperform most of the other mainstream frameworks. - **Industry-Oriented Models and Libraries with Open Source Repositories** @@ -87,8 +85,13 @@ We provide [English](https://www.paddlepaddle.org.cn/documentation/docs/en/guide ## Communication - [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc. -- QQ discussion group: 778260830 (PaddlePaddle). +- QQ discussion group: 793866180 (PaddlePaddle). - [Forums](https://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc. + +## Courses + +- [Server Deployments](https://aistudio.baidu.com/aistudio/course/introduce/19084): Courses introducing high-performance server deployments via local and remote services. +- [Edge Deployments](https://aistudio.baidu.com/aistudio/course/introduce/22690): Courses introducing edge deployments from mobile and IoT devices to web and applets. ## Copyright and License PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). diff --git a/README_cn.md b/README_cn.md index 7a10cba2845498d2299fc516f5804eb1a84e4ecc..cc8afde7dd266262c321c8277c88e6420716d7f6 100644 --- a/README_cn.md +++ b/README_cn.md @@ -1,4 +1,4 @@ - +

@@ -19,7 +19,7 @@ ## 安装 -### PaddlePaddle最新版本: [v2.0](https://github.com/PaddlePaddle/Paddle/tree/release/2.0) +### PaddlePaddle最新版本: [v2.1](https://github.com/PaddlePaddle/Paddle/tree/release/2.1) 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) @@ -32,7 +32,7 @@ pip install paddlepaddle-gpu ``` 更多安装信息详见官网 [安装说明](https://www.paddlepaddle.org.cn/install/quick) -PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型更高效。**每日登陆即送10小时**,[前往使用免费算力](https://aistudio.baidu.com/aistudio/index)。 +PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型更高效。**每日登陆即送8小时**,[前往使用免费算力](https://aistudio.baidu.com/aistudio/index)。 ## 四大领先技术 @@ -47,10 +47,9 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型 [查看详情](https://github.com/PaddlePaddle/Fleet) -- **多端多平台部署的高性能推理引擎** +- **支持多端多平台的高性能推理部署工具** - 飞桨不仅兼容其他开源框架训练的模型,还可以轻松地部署到不同架构的平台设备上。同时,飞桨的推理速度也是全面领先的。尤其经过了跟华为麒麟NPU的软硬一体优化,使得飞桨在NPU上的推理速度进一步突破。 - [查看详情](https://github.com/PaddlePaddle/Paddle-Lite) + 飞桨不仅广泛兼容第三方开源框架训练的模型部署,并且为不同的场景的生产环境提供了完备的推理引擎,包括适用于高性能服务器及云端推理的原生推理库 [Paddle Inference](https://paddle-inference.readthedocs.io/en/latest/product_introduction/summary.html),面向分布式、流水线生产环境下自动上云、A/B测试等高阶功能的服务化推理框架 [Paddle Serving](https://github.com/PaddlePaddle/Serving),针对于移动端、物联网场景的轻量化推理引擎 [Paddle Lite](https://github.com/PaddlePaddle/Paddle-Lite),以及在浏览器、小程序等环境下使用的前端推理引擎 [Paddle.js](https://www.paddlepaddle.org.cn/paddle/paddlejs)。同时,透过与不同场景下的主流硬件高度适配优化及异构计算的支持, 飞桨的推理性能也领先绝大部分的主流实现。 - **面向产业应用,开源开放覆盖多领域的工业级模型库。** @@ -83,8 +82,13 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型 ## 交流与反馈 - 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议 -- QQ群: 778260830 (PaddlePaddle) +- QQ群: 793866180 (PaddlePaddle) - [论坛](https://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围 + +## 课程 + +- [服务器部署](https://aistudio.baidu.com/aistudio/course/introduce/19084): 详细介绍高性能服务器端部署实操,包含本地端及服务化Serving部署等 +- [端侧部署](https://aistudio.baidu.com/aistudio/course/introduce/22690): 详细介绍端侧多场景部署实操,从移端端设备、IoT、网页到小程序部署 ## 版权和许可证 PaddlePaddle由[Apache-2.0 license](LICENSE)提供 diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 6056b53bc2218fb24d2e97b281b9a0d68bc9a306..69e66407580b62d52c941fee522bae7dbca23796 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -69,15 +69,21 @@ if(NOT DEFINED CBLAS_PROVIDER) PATHS ${OPENBLAS_LIB_SEARCH_PATHS}) if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB) - set(CBLAS_PROVIDER OPENBLAS) - set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) - set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) - - add_definitions(-DPADDLE_USE_OPENBLAS) - add_definitions(-DLAPACK_FOUND) - - message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") - message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") + file(READ "${OPENBLAS_INC_DIR}/openblas_config.h" config_file) + string(REGEX MATCH "OpenBLAS ([0-9]+\.[0-9]+\.[0-9]+)" tmp ${config_file}) + string(REGEX MATCH "([0-9]+\.[0-9]+\.[0-9]+)" ver ${tmp}) + + if (${ver} VERSION_GREATER_EQUAL "0.3.7") + set(CBLAS_PROVIDER OPENBLAS) + set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) + set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) + + add_definitions(-DPADDLE_USE_OPENBLAS) + add_definitions(-DLAPACK_FOUND) + + message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") + endif() endif() endif() diff --git a/cmake/ccache.cmake b/cmake/ccache.cmake 
index 64f4f6c2a1c254d868b29bdcebf9840a54146d4a..25798758473af52dc66230ac70a7d750e78176de 100644 --- a/cmake/ccache.cmake +++ b/cmake/ccache.cmake @@ -1,14 +1,29 @@ # Use ccache if found ccache program -find_program(CCACHE_PATH ccache) +if(NOT WIN32) + find_program(CCACHE_PATH ccache) + if(CCACHE_PATH) + execute_process(COMMAND ccache -V OUTPUT_VARIABLE ccache_output) + execute_process(COMMAND ccache -s cache directory OUTPUT_VARIABLE cache_directory) + string(REGEX MATCH "[0-9]+.[0-9]+" ccache_version ${ccache_output}) + message(STATUS "ccache found, use ccache to speed up compilation on Unix.") + # show statistics summary of ccache + message("ccache version\t\t\t " ${ccache_version} "\n" ${cache_directory}) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH}) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH}) + endif(CCACHE_PATH) +elseif("${CMAKE_GENERATOR}" STREQUAL "Ninja") + # (Note:zhouwei25) Only the Ninja Generator supports sccache now + find_program(SCCACHE_PATH sccache) -if(CCACHE_PATH) - execute_process(COMMAND ccache -V OUTPUT_VARIABLE ccache_output) - execute_process(COMMAND ccache -s cache directory OUTPUT_VARIABLE cache_directory) - string(REGEX MATCH "[0-9]+.[0-9]+" ccache_version ${ccache_output}) - message(STATUS "Ccache is founded, use ccache to speed up compile.") - # show statistics summary of ccache - message("ccache version\t\t\t " ${ccache_version} "\n" ${cache_directory}) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH}) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH}) -endif(CCACHE_PATH) + if(SCCACHE_PATH) + execute_process(COMMAND sccache -V OUTPUT_VARIABLE sccache_version) + message(STATUS "${sccache_version} found, use [${SCCACHE_PATH}] to speed up compilation on Windows.") + + set(CMAKE_C_COMPILER_LAUNCHER ${SCCACHE_PATH}) + set(CMAKE_CXX_COMPILER_LAUNCHER ${SCCACHE_PATH}) + # (Note:zhouwei25) sccache for the CUDA compiler has a bug so that the cache can't be hit, + # refer to https://github.com/mozilla/sccache/issues/1017, so we fixed it + set(CMAKE_CUDA_COMPILER_LAUNCHER ${SCCACHE_PATH}) + endif(SCCACHE_PATH) +endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index e7f125269be1f5e015c6cf015489c312538ca4ba..458ab992c25f3818ae53b28fab38d9f986a36265 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -143,6 +143,14 @@ elseif(WITH_ROCM) add_definitions(-DPADDLE_WITH_HIP) add_definitions(-DEIGEN_USE_GPU) add_definitions(-DEIGEN_USE_HIP) + + if(NOT MIOPEN_FOUND) + message(FATAL_ERROR "Paddle needs MIOpen to compile") + endif() + + if(${MIOPEN_VERSION} VERSION_LESS 2090) + message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile") + endif() else() add_definitions(-DHPPL_STUB_FUNC) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 7f2addb02d36ddf85cd08542cc5baab31d495bc5..e1a9324650ac9c2c595ea7727354069080df10c1 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -95,11 +95,23 @@ function(select_nvcc_arch_flags out_variable) if(${CUDA_ARCH_NAME} STREQUAL "Kepler") set(cuda_arch_bin "30 35") elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") - set(cuda_arch_bin "50") + if (WITH_NV_JETSON) + set(cuda_arch_bin "53") + else() + set(cuda_arch_bin "50") + endif() elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") - set(cuda_arch_bin "60 61") + if (WITH_NV_JETSON) + set(cuda_arch_bin "62") + else() + set(cuda_arch_bin "60 61") + endif() elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") - set(cuda_arch_bin "70") + if (WITH_NV_JETSON) + set(cuda_arch_bin "72") +
else() + set(cuda_arch_bin "70") + endif() elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") set(cuda_arch_bin "75") elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere") @@ -205,26 +217,18 @@ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") if(WIN32) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"/wd4244 /wd4267 /wd4819 \"") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /bigobj") - if(CMAKE_BUILD_TYPE STREQUAL "Debug") - # match the cl's _ITERATOR_DEBUG_LEVEL - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"-g -G -D_DEBUG\"") - if(MSVC_STATIC_CRT) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MTd") - else() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MDd") - endif() - elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"-DNDEBUG\"") - if(MSVC_STATIC_CRT) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MT") - else() - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler /MD") - endif() - else() - message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") + if(MSVC_STATIC_CRT) + foreach(flag_var + CMAKE_CUDA_FLAGS CMAKE_CUDA_FLAGS_DEBUG CMAKE_CUDA_FLAGS_RELEASE + CMAKE_CUDA_FLAGS_MINSIZEREL CMAKE_CUDA_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "-MD") + string(REGEX REPLACE "-MD" "-MT" ${flag_var} "${${flag_var}}") + endif() + endforeach(flag_var) endif() endif() mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) +include(thrust) diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index f14195480b7dc80df0566c9b09075797010fe289..d88d693d8286d1efab5242fb758331ef64663a4d 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -46,6 +46,7 @@ ExternalProject_Add( ${BOOST_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} "${BOOST_DOWNLOAD_CMD}" + URL_MD5 f891e8c2c9424f0565f0129ad9ab4aff PREFIX ${BOOST_PREFIX_DIR} DOWNLOAD_DIR ${BOOST_SOURCE_DIR} SOURCE_DIR ${BOOST_SOURCE_DIR} diff --git a/cmake/external/box_ps.cmake b/cmake/external/box_ps.cmake index adfc6dba1f083e11446401e6b5d5623db080f912..85e1f94fd2c67f2526a5201045caac724fd2250f 100644 --- a/cmake/external/box_ps.cmake +++ b/cmake/external/box_ps.cmake @@ -49,7 +49,10 @@ ExternalProject_Add( DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${BOX_PS_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BOX_PS_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${BOX_PS_LIB} ) ADD_LIBRARY(box_ps SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET box_ps PROPERTY IMPORTED_LOCATION ${BOX_PS_LIB}) diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 2d72b6eb56deaa2547051756afc075a100aeb251..1a45cfa0a1e514aae83808aebf401c38efd825fd 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -45,23 +45,24 @@ ExternalProject_Add( PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_PREFIX_PATH=${prefix_path} - -DWITH_GLOG=ON - -DIOBUF_WITH_HUGE_BLOCK=ON - -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA} 
- ${EXTERNAL_OPTIONAL_ARGS} - LIST_SEPARATOR | + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_PREFIX_PATH=${prefix_path} + -DWITH_GLOG=ON + -DIOBUF_WITH_HUGE_BLOCK=ON + -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA} + ${EXTERNAL_OPTIONAL_ARGS} + LIST_SEPARATOR | CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${BRPC_LIBRARIES} ) # ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy) ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog snappy) diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake index a30164ada2791bd90529a34e4103a358854ccec6..aedd40aec68481e1a92924bcd484384ecdd87d88 100644 --- a/cmake/external/cryptopp.cmake +++ b/cmake/external/cryptopp.cmake @@ -33,6 +33,10 @@ ELSE(WIN32) SET(CRYPTOPP_LIBRARIES "${CRYPTOPP_INSTALL_DIR}/lib/libcryptopp.a" CACHE FILEPATH "cryptopp library." FORCE) ENDIF(WIN32) +IF(APPLE AND WITH_ARM) + SET(CMAKE_CXX_FLAGS "-DCRYPTOPP_ARM_CRC32_AVAILABLE=0") +ENDIF() + set(CRYPTOPP_CMAKE_ARGS ${COMMON_CMAKE_ARGS} -DBUILD_SHARED=ON -DBUILD_STATIC=ON @@ -72,6 +76,7 @@ ExternalProject_Add( CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CRYPTOPP_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${CRYPTOPP_LIBRARIES} ) ADD_LIBRARY(cryptopp STATIC IMPORTED GLOBAL) diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake index a26568860f42dae5cdcce0b1bf51d06b531608c6..f263086e8bef80864790e2c44474a45f072a3873 100644 --- a/cmake/external/cub.cmake +++ b/cmake/external/cub.cmake @@ -14,27 +14,27 @@ include(ExternalProject) -set(CUB_PREFIX_DIR ${THIRD_PARTY_PATH}/cub) -set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub/src/extern_cub) -set(CUB_REPOSITORY ${GIT_URL}/NVlabs/cub.git) -set(CUB_TAG 1.8.0) +# Note(zhouwei): extern_cub contains __FILE__ in its code; if the path of extern_cub changes, +# it will affect the sccache hit rate of about 30+ .cu files and slow down compilation on Windows. +# Therefore, a fixed CUB_PATH is used to increase the sccache hit rate.
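(Editorial sketch, not part of the patch: the note above depends on the sccache compiler-launcher wiring that cmake/ccache.cmake sets up earlier in this diff. A minimal illustration of that pattern follows, assuming sccache is installed and on PATH; because the cache key depends on the compile inputs, including expanded path-dependent macros such as __FILE__, pinning third-party paths like CUB_PATH keeps cache hits stable.)

```cmake
# Minimal sketch of the compiler-launcher pattern used by cmake/ccache.cmake in this
# patch; it assumes sccache is on PATH and adds nothing beyond what the patch does.
find_program(SCCACHE_PATH sccache)
if(SCCACHE_PATH)
  # Route C, C++ and CUDA compiles through sccache so repeated builds of
  # unchanged third-party sources are served from the cache.
  set(CMAKE_C_COMPILER_LAUNCHER ${SCCACHE_PATH})
  set(CMAKE_CXX_COMPILER_LAUNCHER ${SCCACHE_PATH})
  set(CMAKE_CUDA_COMPILER_LAUNCHER ${SCCACHE_PATH})
endif()
```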
+set(CUB_PATH "${THIRD_PARTY_PATH}/cub" CACHE STRING "A path setting for external_cub path.") +set(CUB_PREFIX_DIR ${CUB_PATH}) -cache_third_party(extern_cub - REPOSITORY ${CUB_REPOSITORY} - TAG ${CUB_TAG} - DIR CUB_SOURCE_DIR) +set(CUB_REPOSITORY ${GIT_URL}/NVlabs/cub.git) +set(CUB_TAG 1.8.0) -SET(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}) +SET(CUB_INCLUDE_DIR ${CUB_PREFIX_DIR}/src/extern_cub) +message("CUB_INCLUDE_DIR is ${CUB_INCLUDE_DIR}") include_directories(${CUB_INCLUDE_DIR}) ExternalProject_Add( extern_cub ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} - "${CUB_DOWNLOAD_CMD}" + GIT_REPOSITORY ${CUB_REPOSITORY} + GIT_TAG ${CUB_TAG} PREFIX ${CUB_PREFIX_DIR} - SOURCE_DIR ${CUB_SOURCE_DIR} - UPDATE_COMMAND "" + UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake index bc8611f3862cd14c0de493564ea82a1c9ce66667..3c64e1ea11ecd65ab15e80147cd62b1cde371722 100644 --- a/cmake/external/dgc.cmake +++ b/cmake/external/dgc.cmake @@ -39,6 +39,7 @@ ExternalProject_Add( && cp ${DGC_SOURCES_DIR}/build/lib/libdgc.a ${DGC_LIBRARIES} && cp ${DGC_SOURCES_DIR}/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/ BUILD_IN_SOURCE 1 + BUILD_BYPRODUCTS ${DGC_LIBRARIES} ) ADD_LIBRARY(dgc STATIC IMPORTED GLOBAL) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 4619f9f7b7e34c99f7fb3048a3eae9e9ffc0b5ac..aa471002eacb6a61a9cf835f293a86a75d87db8f 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -33,7 +33,9 @@ elseif(LINUX) # which will cause compiler error of using __host__ funciont in __host__ __device__ file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst) - set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst}) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorReductionGpu.h native_src1) + file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h native_dst1) + set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst} && cp ${native_src1} ${native_dst1}) endif() endif() diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 576598b4ac6e3bc085f75465456be580db159005..8360761de6fb9869fec42fa40e87fd29e595650f 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -61,6 +61,7 @@ ExternalProject_Add( CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GFLAGS_LIBRARIES} ) ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 05b98e2b56a33a65315d1e4fb1c02c738f93b712..d2bb1e62e83de391272315d379619feca84c62bd 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -64,6 +64,7 @@ ExternalProject_Add( -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GLOG_LIBRARIES} ) ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index e8db13a694f5578e314dc1a7c95ed24ad88bad02..03e45e3e5c67b0118727a616f8cd0c013c621fe6 100644 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -32,7 +32,7 @@ cache_third_party(extern_gloo TAG ${GLOO_TAG} DIR GLOO_SOURCE_DIR) - if(WITH_ASCEND OR WITH_ASCEND_CL) +if(WITH_ASCEND OR WITH_ASCEND_CL) 
ExternalProject_Add( extern_gloo ${EXTERNAL_PROJECT_LOG_ARGS} @@ -47,6 +47,7 @@ cache_third_party(extern_gloo && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + BUILD_BYPRODUCTS ${GLOO_LIBRARIES} ) else() ExternalProject_Add( @@ -63,6 +64,7 @@ else() && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + BUILD_BYPRODUCTS ${GLOO_LIBRARIES} ) endif() diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 3db12f084eb5a3519e529afe90a151b33823fe82..e7d4783a9593a7bac474adc089eaca543db7a600 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -79,6 +79,8 @@ ExternalProject_Add( CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${GTEST_LIBRARIES} + BUILD_BYPRODUCTS ${GTEST_MAIN_LIBRARIES} ) ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL) diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake index 79dc403e67d5266fe618b997c08c75d4cc86b82b..c36f49d3bd354acabf3654b642fd24ba227470b8 100644 --- a/cmake/external/leveldb.cmake +++ b/cmake/external/leveldb.cmake @@ -33,6 +33,7 @@ ExternalProject_Add( && cp ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES} && cp -r ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/ BUILD_IN_SOURCE 1 + BUILD_BYPRODUCTS ${LEVELDB_LIBRARIES} ) ADD_DEPENDENCIES(extern_leveldb snappy) diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake index c10a662485c2d172f408a7622e7f14d0b566f274..d318bc7d0f3c3fa99d68a502496423ffbc4c08a2 100644 --- a/cmake/external/libmct.cmake +++ b/cmake/external/libmct.cmake @@ -49,7 +49,9 @@ ExternalProject_Add( DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ) add_library(libmct INTERFACE) diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake index 0d09576286d907ec6964df69efb0efcf9885f57d..fae8154eb1cb0354683b8141eeb28a7bf5012cbe 100644 --- a/cmake/external/libxsmm.cmake +++ b/cmake/external/libxsmm.cmake @@ -18,8 +18,8 @@ SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm) SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm) SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE) SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." 
FORCE) -SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a" - "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") +SET(LIBXSMM_LIB "${LIBXSMM_LIBRARY_DIR}/libxsmm.a") +SET(LIBXSMMNOBLAS_LIB "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") ExternalProject_Add( extern_libxsmm @@ -32,10 +32,12 @@ ExternalProject_Add( BUILD_IN_SOURCE 1 BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install INSTALL_COMMAND "" + BUILD_BYPRODUCTS ${LIBXSMM_LIB} + BUILD_BYPRODUCTS ${LIBXSMMNOBLAS_LIB} ) ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a") -SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") +SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIB}") +SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMMNOBLAS_LIB}") MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}") include_directories(${LIBXSMM_INCLUDE_DIR}) diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index 6e2157e30871678a5f78ac35726805fb1c1f0466..e213068377b1409595cac9b6169fe7605cff059c 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -18,13 +18,21 @@ if(NOT LINUX) return() endif() -if(XPU_SDK_ROOT) - set(LITE_WITH_XPU ON) - include_directories("${XPU_SDK_ROOT}/XTDK/include") - include_directories("${XPU_SDK_ROOT}/XTCL/include") +if (LITE_WITH_XPU) add_definitions(-DLITE_SUBGRAPH_WITH_XPU) - LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/shlib/") - LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/runtime/shlib/") + IF(WITH_AARCH64) + SET(XPU_SDK_ENV "kylin_aarch64") + ELSEIF(WITH_SUNWAY) + SET(XPU_SDK_ENV "deepin_sw6_64") + ELSEIF(WITH_BDCENTOS) + SET(XPU_SDK_ENV "bdcentos_x86_64") + ELSEIF(WITH_UBUNTU) + SET(XPU_SDK_ENV "ubuntu_x86_64") + ELSEIF(WITH_CENTOS) + SET(XPU_SDK_ENV "centos7_x86_64") + ELSE () + SET(XPU_SDK_ENV "ubuntu_x86_64") + ENDIF() endif() if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) @@ -57,7 +65,8 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) -DWITH_TESTING=OFF -DLITE_BUILD_EXTRA=ON -DLITE_WITH_XPU=${LITE_WITH_XPU} - -DXPU_SDK_ROOT=${XPU_SDK_ROOT} + -DXPU_SDK_URL=${XPU_BASE_URL} + -DXPU_SDK_ENV=${XPU_SDK_ENV} -DLITE_WITH_CODE_META_INFO=OFF -DLITE_WITH_ARM=ON) ExternalProject_Add( @@ -99,7 +108,8 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) -DLITE_WITH_STATIC_CUDA=OFF -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME} -DLITE_WITH_XPU=${LITE_WITH_XPU} - -DXPU_SDK_ROOT=${XPU_SDK_ROOT} + -DXPU_SDK_URL=${XPU_BASE_URL} + -DXPU_SDK_ENV=${XPU_SDK_ENV} -DLITE_WITH_CODE_META_INFO=OFF -DLITE_WITH_ARM=OFF) @@ -147,6 +157,10 @@ message(STATUS "Paddle-lite BINARY_DIR: ${LITE_BINARY_DIR}") message(STATUS "Paddle-lite SOURCE_DIR: ${LITE_SOURCE_DIR}") include_directories(${LITE_SOURCE_DIR}) include_directories(${LITE_BINARY_DIR}) +if(LITE_WITH_XPU) + include_directories(${LITE_BINARY_DIR}/third_party/install/xpu/xdnn/include/) + include_directories(${LITE_BINARY_DIR}/third_party/install/xpu/xre/include/) +endif() function(external_lite_libs alias path) add_library(${alias} SHARED IMPORTED GLOBAL) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index fb1d4d9d56dcc6f38a86242b4d78b88ef31ddaa0..9963237ff188cfc736520588fc462a4a7c8a1700 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,8 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include 
directory." FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG f58682cd8bd0615f41d879f8afc8f1511ab42d24) +SET(MKLDNN_TAG 593e0de6267d2575f3e4c9e9818f0f11253d093a) + # Introduce variables: # * CMAKE_INSTALL_LIBDIR @@ -42,8 +43,10 @@ IF(NOT WIN32) SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libdnnl.so" CACHE FILEPATH "mkldnn library." FORCE) ELSE() SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc") + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE) ENDIF(NOT WIN32) cache_third_party(${MKLDNN_PROJECT} @@ -59,8 +62,8 @@ ExternalProject_Add( DEPENDS ${MKLDNN_DEPENDS} PREFIX ${MKLDNN_PREFIX_DIR} SOURCE_DIR ${MKLDNN_SOURCE_DIR} - BUILD_ALWAYS 1 - # UPDATE_COMMAND "" + UPDATE_COMMAND "" + #BUILD_ALWAYS 1 CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} @@ -76,12 +79,8 @@ ExternalProject_Add( -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} -DDNNL_BUILD_TESTS=OFF -DDNNL_BUILD_EXAMPLES=OFF CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} + BUILD_BYPRODUCTS ${MKLDNN_LIB} ) -if(WIN32) - SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE) -else(WIN32) - SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libdnnl.so" CACHE FILEPATH "mkldnn library." FORCE) -endif(WIN32) ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) @@ -100,8 +99,11 @@ ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) # it can be directly contained in wheel or capi if(WIN32) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll) + + file(TO_NATIVE_PATH ${MKLDNN_INSTALL_DIR} NATIVE_MKLDNN_INSTALL_DIR) + file(TO_NATIVE_PATH ${MKLDNN_SHARED_LIB} NATIVE_MKLDNN_SHARED_LIB) ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_INSTALL_DIR}/bin/dnnl.dll ${MKLDNN_SHARED_LIB}) + COMMAND (copy ${NATIVE_MKLDNN_INSTALL_DIR}\\bin\\dnnl.dll ${NATIVE_MKLDNN_SHARED_LIB} /Y)) add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM COMMAND dumpbin /exports ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll > ${MKLDNN_INSTALL_DIR}/bin/exports.txt) add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM @@ -109,7 +111,7 @@ if(WIN32) add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM COMMAND echo EXPORTS >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM - COMMAND for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) + COMMAND echo off && (for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) && echo on) add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM COMMAND lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib /machine:x64) else(WIN32) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 4cf9b626d15472206f47cd604d0b5b87089c4476..a4df5756ce015d14e0a366643ed6e0c45385657c 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -24,6 +24,7 @@ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") IF(WIN32) SET(MKLML_VER 
"mklml_win_2019.0.5.20190502" CACHE STRING "" FORCE) SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) + SET(MKLML_URL_MD5 ff8c5237570f03eea37377ccfc95a08a) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) @@ -33,6 +34,7 @@ ELSE() # Now enable csrmm function in mklml library temporarily, it will be updated as offical version later. SET(MKLML_VER "csrmm_mklml_lnx_2019.0.5" CACHE STRING "" FORCE) SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) + SET(MKLML_URL_MD5 bc6a7faea6a2a9ad31752386f3ae87da) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) @@ -48,10 +50,15 @@ cache_third_party(${MKLML_PROJECT} URL ${MKLML_URL} DIR MKLML_SOURCE_DIR) +# Ninja Generator can not establish the correct dependency relationship between the imported library with target, +# the product file in the ExternalProject need to be specified manually, please refer to +# https://stackoverflow.com/questions/54866067/cmake-and-ninja-missing-and-no-known-rule-to-make-it +# It is the same to all other ExternalProject. ExternalProject_Add( ${MKLML_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} "${MKLML_DOWNLOAD_CMD}" + URL_MD5 ${MKLML_URL_MD5} PREFIX ${MKLML_PREFIX_DIR} DOWNLOAD_DIR ${MKLML_SOURCE_DIR} SOURCE_DIR ${MKLML_SOURCE_DIR} @@ -60,7 +67,9 @@ ExternalProject_Add( BUILD_COMMAND "" UPDATE_COMMAND "" INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${MKLML_SOURCE_DIR}/include ${MKLML_INC_DIR} && - ${CMAKE_COMMAND} -E copy_directory ${MKLML_SOURCE_DIR}/lib ${MKLML_LIB_DIR} + ${CMAKE_COMMAND} -E copy_directory ${MKLML_SOURCE_DIR}/lib ${MKLML_LIB_DIR} + BUILD_BYPRODUCTS ${MKLML_LIB} + BUILD_BYPRODUCTS ${MKLML_IOMP_LIB} ) INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 19ba6d15c59ea802cc94ea6138871c15cb49077b..a6033a20c6fb06c6e6b26100c1997b7881767e85 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -19,6 +19,10 @@ SET(CBLAS_SOURCE_DIR ${THIRD_PARTY_PATH}/openblas/src/extern_openblas) SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) SET(CBLAS_REPOSITORY ${GIT_URL}/xianyi/OpenBLAS.git) SET(CBLAS_TAG v0.3.7) +if(APPLE AND WITH_ARM) + SET(CBLAS_TAG v0.3.13) +endif() + if(WITH_MIPS) SET(CBLAS_TAG v0.3.13) endif() diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index c108c05368c915f6d4998d46713cda315dfb93ff..a2b6ddadb625f67f119cc314970f1a654cf0c0ab 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -239,6 +239,10 @@ endif() -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON ${OPTIONAL_CACHE_ARGS} + BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX} + BUILD_BYPRODUCTS ${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX} ) ENDFUNCTION() diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake index bdfd335172d877d7e294c898dad7e3a554f5531c..40d198b2958339d938961d7d75fe357826b4e227 100644 --- a/cmake/external/pslib.cmake +++ b/cmake/external/pslib.cmake @@ -53,7 +53,10 @@ ExternalProject_Add( DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND 
"" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${PSLIB_LIB} ) ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake index 7b00474a650706b6de6e549c56ca94485cfc2300..d69c27a197b25a7320e7755f26b7a151628e1c62 100644 --- a/cmake/external/pslib_brpc.cmake +++ b/cmake/external/pslib_brpc.cmake @@ -52,7 +52,10 @@ ExternalProject_Add( DOWNLOAD_NO_PROGRESS 1 UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${PSLIB_BRPC_LIB} ) ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL) diff --git a/cmake/external/rocksdb.cmake b/cmake/external/rocksdb.cmake new file mode 100644 index 0000000000000000000000000000000000000000..f5b85cc71a25f12285bb02648df55c3d88ec8e53 --- /dev/null +++ b/cmake/external/rocksdb.cmake @@ -0,0 +1,51 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(ROCKSDB_SOURCES_DIR ${THIRD_PARTY_PATH}/rocksdb) +SET(ROCKSDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocksdb) +SET(ROCKSDB_INCLUDE_DIR "${ROCKSDB_INSTALL_DIR}/include" CACHE PATH "rocksdb include directory." FORCE) +SET(ROCKSDB_LIBRARIES "${ROCKSDB_INSTALL_DIR}/lib/librocksdb.a" CACHE FILEPATH "rocksdb library." 
FORCE) +SET(ROCKSDB_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") +INCLUDE_DIRECTORIES(${ROCKSDB_INCLUDE_DIR}) + +ExternalProject_Add( + extern_rocksdb + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${ROCKSDB_SOURCES_DIR} + GIT_REPOSITORY "https://github.com/facebook/rocksdb" + GIT_TAG v6.10.1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DWITH_BZ2=OFF + -DWITH_GFLAGS=OFF + -DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} +# BUILD_BYPRODUCTS ${ROCKSDB_SOURCES_DIR}/src/extern_rocksdb/librocksdb.a + INSTALL_COMMAND mkdir -p ${ROCKSDB_INSTALL_DIR}/lib/ + && cp ${ROCKSDB_SOURCES_DIR}/src/extern_rocksdb/librocksdb.a ${ROCKSDB_LIBRARIES} + && cp -r ${ROCKSDB_SOURCES_DIR}/src/extern_rocksdb/include ${ROCKSDB_INSTALL_DIR}/ + BUILD_IN_SOURCE 1 +) + +ADD_DEPENDENCIES(extern_rocksdb snappy) + +ADD_LIBRARY(rocksdb STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET rocksdb PROPERTY IMPORTED_LOCATION ${ROCKSDB_LIBRARIES}) +ADD_DEPENDENCIES(rocksdb extern_rocksdb) + +LIST(APPEND external_project_dependencies rocksdb) + diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index ab9cb02307c1f04384f8e12e843c121c01995d12..fb4c1c7cc8a3d57846648b5638f54adf40b50416 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -22,8 +22,15 @@ set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy includ if(WIN32) SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267") + IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") + add_custom_command(TARGET extern_snappy POST_BUILD + COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib + ) + ENDIF() + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") else() SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") endif() ExternalProject_Add( @@ -33,35 +40,26 @@ ExternalProject_Add( PREFIX ${SNAPPY_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DBUILD_TESTING=OFF - -DSNAPPY_BUILD_TESTS:BOOL=OFF - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DSNAPPY_BUILD_TESTS:BOOL=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + 
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${SNAPPY_LIBRARIES} ) -IF(WIN32) - IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") - add_custom_command(TARGET extern_snappy POST_BUILD - COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib - ) - ENDIF() - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") -else(WIN32) - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") -endif (WIN32) add_library(snappy STATIC IMPORTED GLOBAL) set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES}) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 100b9153394690f6d872a4f16fb0a1ee5827b89f..532ebaaf5c0643a86fcf24022d0084fb572877b5 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -24,7 +24,7 @@ SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) # in case of low internet speed #set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git) set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) -set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1) +set(WARPCTC_TAG 37ece0e1bbe8a0019a63ac7e6462c36591c66a5b) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) @@ -32,6 +32,14 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE) +IF(WIN32) + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +else(WIN32) + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +ENDIF(WIN32) + IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32) SET(USE_OMP OFF) ELSE() @@ -59,7 +67,7 @@ if(WITH_ASCEND OR WITH_ASCEND_CL) -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} @@ -76,8 +84,24 @@ if(WITH_ASCEND OR WITH_ASCEND_CL) CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES} ) else() + if(WIN32) + set(WARPCTC_C_FLAGS $) + set(WARPCTC_C_FLAGS_DEBUG $) + set(WARPCTC_C_FLAGS_RELEASE $) + set(WARPCTC_CXX_FLAGS $) + set(WARPCTC_CXX_FLAGS_RELEASE $) + set(WARPCTC_CXX_FLAGS_DEBUG $) + else() + set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) + endif() ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} @@ -85,17 +109,17 @@ else() "${WARPCTC_DOWNLOAD_CMD}" PREFIX ${WARPCTC_PREFIX_DIR} SOURCE_DIR ${WARPCTC_SOURCE_DIR} - #UPDATE_COMMAND "" + UPDATE_COMMAND "" PATCH_COMMAND "" - BUILD_ALWAYS 1 + #BUILD_ALWAYS 1 CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} 
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=$ - -DCMAKE_C_FLAGS_DEBUG=$ - -DCMAKE_C_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS=$ - -DCMAKE_CXX_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS_DEBUG=$ + -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} -DWITH_GPU=${WITH_GPU} -DWITH_ROCM=${WITH_ROCM} @@ -110,18 +134,10 @@ else() CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES} ) endif() - -IF(WIN32) - SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" - CACHE FILEPATH "Warp-ctc Library" FORCE) -else(WIN32) - SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" - CACHE FILEPATH "Warp-ctc Library" FORCE) -ENDIF(WIN32) - MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers. diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake index 610a692ef12c6ae6f992fff8e4e65f48f3aeb01f..eabcabf7430633bd14bcf9814f112e7a4d043336 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -46,7 +46,9 @@ ExternalProject_Add( SOURCE_DIR ${XBYAK_SOURCE_DIR} # UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT} + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} ) add_library(xbyak INTERFACE) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index f846623602ed79a5bd84268436a59ede1957364b..42de34fb52061af23eee28377659ed4cbbb4de0a 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -7,52 +7,74 @@ SET(XPU_PROJECT "extern_xpu") SET(XPU_API_LIB_NAME "libxpuapi.so") SET(XPU_RT_LIB_NAME "libxpurt.so") -if(NOT XPU_SDK_ROOT) - if (WITH_AARCH64) - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/aarch64/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) - elseif(WITH_SUNWAY) - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) - else() - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_04_09.tar.gz" CACHE STRING "" FORCE) - endif() - - SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") - SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}") - SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") - SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") - SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib") - - SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}") - SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}") - - SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") - - FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(XPU)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY xpu/include xpu/lib \n" - " DESTINATION ${XPU_INSTALL_DIR})\n") - - ExternalProject_Add( - ${XPU_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${XPU_SOURCE_DIR} - DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget 
--no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz - && tar xvf xpu.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT} - ) -else() - SET(XPU_API_INC_DIR "${XPU_SDK_ROOT}/XTDK/include/") - SET(XPU_API_LIB "${XPU_SDK_ROOT}/XTDK/shlib/libxpuapi.so") - SET(XPU_RT_LIB "${XPU_SDK_ROOT}/XTDK/runtime/shlib/libxpurt.so") - SET(XPU_LIB_DIR "${XPU_SDK_ROOT}/XTDK/shlib/") -endif() +IF(WITH_AARCH64) + SET(XPU_XRE_DIR_NAME "xre-kylin_aarch64") + SET(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64") + SET(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64") +ELSEIF(WITH_SUNWAY) + SET(XPU_XRE_DIR_NAME "xre-deepin_sw6_64") + SET(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64") + SET(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64") +ELSEIF(WITH_BDCENTOS) + SET(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64") + SET(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64") + SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") +ELSEIF(WITH_UBUNTU) + SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") + SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") + SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") +ELSEIF(WITH_CENTOS) + SET(XPU_XRE_DIR_NAME "xre-centos7_x86_64") + SET(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64") + SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") + +ELSE () + SET(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64") + SET(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64") + SET(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64") +ENDIF() + +SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") +SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210701") +SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) +SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) +SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) +SET(XPU_PACK_DEPENCE_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh" CACHE STRING "" FORCE) + +SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") +SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}") +SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu") +SET(XPU_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") +SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib") + +SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}") +SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}") + +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") -INCLUDE_DIRECTORIES(${XPU_API_INC_DIR}) +FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(XPU)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY xpu/include xpu/lib \n" + " DESTINATION ${XPU_INSTALL_DIR})\n") + +ExternalProject_Add( + ${XPU_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${XPU_SOURCE_DIR} + DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget ${XPU_PACK_DEPENCE_URL} + && bash pack_paddle_depence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL} ${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} + + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT} + BUILD_BYPRODUCTS ${XPU_API_LIB} + BUILD_BYPRODUCTS ${XPU_RT_LIB} +) + +INCLUDE_DIRECTORIES(${XPU_INC_DIR}) ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL) set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}") @@ -62,7 +84,7 @@ generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake") 
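(Editorial sketch, not part of the patch: the BUILD_BYPRODUCTS entries added above follow the pattern explained in cmake/external/mklml.cmake in this diff — under the Ninja generator, the files an ExternalProject produces must be declared explicitly so that imported library targets depending on them can be scheduled. The example below is generic and hypothetical; extern_foo, FOO_LIB, and the repository URL are placeholders, not real Paddle targets.)

```cmake
# Hypothetical illustration of the ExternalProject + BUILD_BYPRODUCTS + imported-library
# pattern applied throughout this patch; all names and the URL are placeholders.
include(ExternalProject)
set(FOO_INSTALL_DIR ${THIRD_PARTY_PATH}/install/foo)
set(FOO_LIB ${FOO_INSTALL_DIR}/lib/libfoo.a)
ExternalProject_Add(
  extern_foo
  GIT_REPOSITORY https://example.com/foo.git        # placeholder repository
  GIT_TAG        v1.0
  PREFIX         ${THIRD_PARTY_PATH}/foo
  CMAKE_ARGS     -DCMAKE_INSTALL_PREFIX=${FOO_INSTALL_DIR}
  BUILD_BYPRODUCTS ${FOO_LIB})                      # tells Ninja which file this step produces
add_library(foo STATIC IMPORTED GLOBAL)
set_property(TARGET foo PROPERTY IMPORTED_LOCATION ${FOO_LIB})
add_dependencies(foo extern_foo)                    # ties the imported target to the build step
```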
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) -if (WITH_XPU_BKCL) +IF(WITH_XPU_BKCL) MESSAGE(STATUS "Compile with XPU BKCL!") ADD_DEFINITIONS(-DPADDLE_WITH_XPU_BKCL) @@ -71,15 +93,11 @@ if (WITH_XPU_BKCL) SET(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") INCLUDE_DIRECTORIES(${XPU_BKCL_INC_DIR}) TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB}) -else(WITH_XPU_BKCL) - TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ) -endif(WITH_XPU_BKCL) - -if(NOT XPU_SDK_ROOT) - ADD_DEPENDENCIES(xpulib ${XPU_PROJECT}) -else() - ADD_CUSTOM_TARGET(extern_xpu DEPENDS xpulib) -endif() +ELSE(WITH_XPU_BKCL) + TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) +ENDIF(WITH_XPU_BKCL) + +ADD_DEPENDENCIES(xpulib ${XPU_PROJECT}) # Ensure that xpu/api.h can be included without dependency errors. file(GENERATE OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/.xpu_headers_dummy.cc CONTENT "") diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index bdd7df190ff106178266fbd47716e7d70fd229bd..0279d4e2a835c2c1fa2bf8e2f4cafd21391accfc 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -21,10 +21,7 @@ set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include") set(XXHASH_REPOSITORY ${GIT_URL}/Cyan4973/xxHash.git) set(XXHASH_TAG v0.6.5) -cache_third_party(extern_xxhash - REPOSITORY ${XXHASH_REPOSITORY} - TAG ${XXHASH_TAG} - DIR XXHASH_SOURCE_DIR) +INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR}) IF(APPLE) SET(BUILD_CMD sed -i \"\" "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/Makefile && make lib) @@ -32,6 +29,17 @@ ELSEIF(UNIX) SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/Makefile && make lib) ENDIF() +if (WIN32) + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib") +else() + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") +endif () + +cache_third_party(extern_xxhash + REPOSITORY ${XXHASH_REPOSITORY} + TAG ${XXHASH_TAG} + DIR XXHASH_SOURCE_DIR) + if(WIN32) ExternalProject_Add( extern_xxhash @@ -54,6 +62,7 @@ if(WIN32) -DBUILD_SHARED_LIBS=OFF ${OPTIONAL_CACHE_ARGS} TEST_COMMAND "" + BUILD_BYPRODUCTS ${XXHASH_LIBRARIES} ) else() ExternalProject_Add( @@ -68,16 +77,10 @@ else() BUILD_COMMAND ${BUILD_CMD} INSTALL_COMMAND make PREFIX=${XXHASH_INSTALL_DIR} install TEST_COMMAND "" + BUILD_BYPRODUCTS ${XXHASH_LIBRARIES} ) endif() -if (WIN32) - set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib") -else() - set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") -endif () -INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR}) - add_library(xxhash STATIC IMPORTED GLOBAL) set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES}) include_directories(${XXHASH_INCLUDE_DIR}) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 4464787a0c2a64066585e8f308c68a62286478e9..f1a015f6304a386fcc4cb985e4d0523d0d8eabb6 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -25,6 +25,12 @@ set(ZLIB_TAG v1.2.8) INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers. INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h. +IF(WIN32) + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE) +ELSE(WIN32) + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." 
FORCE) +ENDIF(WIN32) + cache_third_party(extern_zlib REPOSITORY ${ZLIB_REPOSITORY} TAG ${ZLIB_TAG} @@ -51,12 +57,8 @@ ExternalProject_Add( CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_BYPRODUCTS ${ZLIB_LIBRARIES} ) -IF(WIN32) - SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE) -ELSE(WIN32) - SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) -ENDIF(WIN32) ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index a2ddad557c2956f7de21bceaf7a6699e8dfbed43..7afff25664bbbb6f8ac93392dc39ed621e57e849 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -28,7 +28,12 @@ function(CheckCompilerCXX14Flag) endfunction() CheckCompilerCXX14Flag() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") +if(NOT WIN32) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") +else() + set(CMAKE_CXX_STANDARD 14) +endif() + # safe_set_flag # # Set a compile flag only if compiler is support @@ -181,8 +186,11 @@ endif() endif(NOT WIN32) if (APPLE) - # On Mac OS X build fat binaries with x86_64 architectures by default. - set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) + if(WITH_ARM) + set (CMAKE_OSX_ARCHITECTURES "arm64" CACHE STRING "Build architectures for OSX" FORCE) + else(WITH_ARM) + set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) + endif(WITH_ARM) # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0 set (COMMON_FLAGS -Wno-deprecated-register) endif(APPLE) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index a5c74a46631e9d76fa78261f706a1853a80bab32..cea65f17fbe836ee5951805dfdf5d3078087ba44 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -92,7 +92,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}) # including io directory for inference lib paddle_api.h include_directories("${PADDLE_SOURCE_DIR}/paddle/fluid/framework/io") -if(NOT APPLE) +if(NOT APPLE AND NOT WIN32) find_package(Threads REQUIRED) link_libraries(${CMAKE_THREAD_LIBS_INIT}) if(WITH_PSLIB OR WITH_DISTRIBUTE) @@ -100,7 +100,7 @@ if(NOT APPLE) else() set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") endif() -endif(NOT APPLE) +endif() set_property(GLOBAL PROPERTY FLUID_MODULES "") # find all fluid modules is used for paddle fluid static library @@ -391,7 +391,7 @@ function(cc_binary TARGET_NAME) endfunction(cc_binary) function(cc_test_build TARGET_NAME) - if(WITH_TESTING) + if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set(oneValueArgs "") set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -409,14 +409,12 @@ function(cc_test_build TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() + check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS}) endif() - - check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS}) - endfunction() function(cc_test_run TARGET_NAME) - if(WITH_TESTING) + if(WITH_TESTING AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set(oneValueArgs "") set(multiValueArgs COMMAND ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) diff --git a/cmake/hip.cmake b/cmake/hip.cmake index 
4c492d7cc48f0657f5389e23ddccc4561708c4a8..514f5ea9deaa32e2c7a926dd38a2c2f8d80682d6 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -85,3 +85,5 @@ message(STATUS "HIP library name: ${hip_library_name}") # set HIP link libs find_library(ROCM_HIPRTC_LIB ${hip_library_name} HINTS ${HIP_PATH}/lib) message(STATUS "ROCM_HIPRTC_LIB: ${ROCM_HIPRTC_LIB}") + +include(thrust) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 9694a7bc59c12a96e1c0c33488895ae94dbf2a03..3dcf0b74f7940f7a0d9c9b5242e7df96bf274cdc 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -146,12 +146,19 @@ copy(inference_lib_dist SRCS ${THREADPOOL_INCLUDE_DIR}/ThreadPool.h DSTS ${dst_dir}) -# Only GPU need cudaErrorMessage.pb +# GPU must copy externalErrorMsg.pb IF(WITH_GPU) - set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/cudaerror/data") - copy(inference_lib_dist - SRCS ${cudaerror_INCLUDE_DIR} - DSTS ${dst_dir}) + set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/externalError/data") + copy(inference_lib_dist + SRCS ${externalError_INCLUDE_DIR} + DSTS ${dst_dir}) +ENDIF() + +IF(WITH_XPU) + set(dst_dir "${PADDLE_INFERENCE_INSTALL_DIR}/third_party/install/xpu") + copy(inference_lib_dist + SRCS ${XPU_INC_DIR} ${XPU_LIB_DIR} + DSTS ${dst_dir} ${dst_dir}) ENDIF() # CMakeCache Info @@ -193,10 +200,7 @@ copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include/* DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex64.h - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) -copy(inference_lib_dist - SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h @@ -259,7 +263,7 @@ copy(fluid_lib_dist set(module "platform") set(platform_lib_deps profiler_proto error_codes_proto) if(WITH_GPU) - set(platform_lib_deps ${platform_lib_deps} cuda_error_proto) + set(platform_lib_deps ${platform_lib_deps} external_error_proto) endif(WITH_GPU) add_dependencies(fluid_lib_dist ${platform_lib_deps}) @@ -323,16 +327,22 @@ function(version version_file) "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" "WITH_MKL: ${WITH_MKL}\n" "WITH_MKLDNN: ${WITH_MKLDNN}\n" - "WITH_GPU: ${WITH_GPU}\n") + "WITH_GPU: ${WITH_GPU}\n" + "WITH_ROCM: ${WITH_ROCM}\n") if(WITH_GPU) file(APPEND ${version_file} "CUDA version: ${CUDA_VERSION}\n" "CUDNN version: v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}\n") endif() + if(WITH_ROCM) + file(APPEND ${version_file} + "HIP version: ${HIP_VERSION}\n" + "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n") + endif() file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") if(TENSORRT_FOUND) file(APPEND ${version_file} - "WITH_TENSORRT: ${TENSORRT_FOUND}\n" "TensorRT version: v${TENSORRT_MAJOR_VERSION}\n") + "WITH_TENSORRT: ${TENSORRT_FOUND}\n" "TensorRT version: v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION}\n") endif() if(WITH_LITE) file(APPEND ${version_file} "WITH_LITE: ${WITH_LITE}\n" "LITE_GIT_TAG: ${LITE_GIT_TAG}\n") diff --git a/cmake/init.cmake b/cmake/init.cmake index b11156d2e9986f879dcf4dd63354edb81c493260..0ebcdc8ceeebcabc2c7c639076939cef5c0fe546 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -17,17 +17,34 @@ if(NOT 
WIN32) set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") + + if(WITH_GPU) + set(CMAKE_CUDA_FLAGS_DEBUG "-g") + set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG") + set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") + set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG") + endif() else() + set(CMAKE_C_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1") + set(CMAKE_C_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG") + set(CMAKE_C_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG") + + set(CMAKE_CXX_FLAGS_DEBUG "/MDd /Zi /Ob0 /Od /RTC1") + set(CMAKE_CXX_FLAGS_RELEASE "/MD /O2 /Ob2 /DNDEBUG") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "/MD /Zi /O2 /Ob1 /DNDEBUG") + set(CMAKE_CXX_FLAGS_MINSIZEREL "/MD /O1 /Ob1 /DNDEBUG") + + if(WITH_GPU) + set(CMAKE_CUDA_FLAGS_DEBUG "-Xcompiler=\"-MDd -Zi -Ob0 -Od /RTC1\"") + set(CMAKE_CUDA_FLAGS_RELEASE "-Xcompiler=\"-MD -O2 -Ob2\" -DNDEBUG") + set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-Xcompiler=\"-MD -Zi -O2 -Ob1\" -DNDEBUG") + set(CMAKE_CUDA_FLAGS_MINSIZEREL "-Xcompiler=\"-MD -O1 -Ob1\" -DNDEBUG") + endif() + # It can specify CUDA compile flag manualy, # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous # because CUDA will update by nvidia, then error will occur. # Now, it's only used in VS2015 + CUDA:[10.0, 10.2] set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() - -if(WITH_GPU) - set(CMAKE_CUDA_FLAGS_DEBUG "-g") - set(CMAKE_CUDA_FLAGS_RELEASE "-O3 -DNDEBUG") - set(CMAKE_CUDA_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") - set(CMAKE_CUDA_FLAGS_MINSIZEREL "-O1 -DNDEBUG") -endif() diff --git a/cmake/miopen.cmake b/cmake/miopen.cmake new file mode 100644 index 0000000000000000000000000000000000000000..f482f423dc5c12c5c0d7d87401c5d4a1d85a218a --- /dev/null +++ b/cmake/miopen.cmake @@ -0,0 +1,67 @@ +if(NOT WITH_ROCM) + return() +endif() + +# Now we don't support ROCm on windows +if(WIN32) + return() +endif() + +set(MIOPEN_ROOT ${ROCM_PATH}/miopen CACHE PATH "MIOPEN ROOT") + +find_path(MIOPEN_INCLUDE_DIR "miopen/miopen.h" + PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/include ${MIOPEN_ROOT}/local/include + $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/include $ENV{MIOPEN_ROOT}/local/include + NO_DEFAULT_PATH +) + +get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) + +find_library(MIOPEN_LIBRARY NAMES "libMIOpen.so" + PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/lib ${MIOPEN_ROOT}/lib64 ${__libpath_hist} + $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/lib $ENV{MIOPEN_ROOT}/lib64 + NO_DEFAULT_PATH + DOC "Path to MIOpen library.") + +if(MIOPEN_INCLUDE_DIR AND MIOPEN_LIBRARY) + set(MIOPEN_FOUND ON) +else() + set(MIOPEN_FOUND OFF) +endif() + +macro(find_miopen_version miopen_header_file) + file(READ ${miopen_header_file} MIOPEN_VERSION_FILE_CONTENTS) + get_filename_component(MIOPEN_LIB_PATH ${MIOPEN_LIBRARY} DIRECTORY) + + string(REGEX MATCH "define MIOPEN_VERSION_MAJOR +([0-9]+)" MIOPEN_MAJOR_VERSION + "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_MAJOR +([0-9]+)" "\\1" + MIOPEN_MAJOR_VERSION "${MIOPEN_MAJOR_VERSION}") + string(REGEX MATCH "define MIOPEN_VERSION_MINOR +([0-9]+)" MIOPEN_MINOR_VERSION + "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_MINOR +([0-9]+)" "\\1" + MIOPEN_MINOR_VERSION "${MIOPEN_MINOR_VERSION}") + string(REGEX MATCH "define MIOPEN_VERSION_PATCH +([0-9]+)" MIOPEN_PATCH_VERSION + "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define 
MIOPEN_VERSION_PATCH +([0-9]+)" "\\1" + MIOPEN_PATCH_VERSION "${MIOPEN_PATCH_VERSION}") + string(REGEX MATCH "define MIOPEN_VERSION_TWEAK +([0-9]+)" MIOPEN_TWEAK_VERSION + "${MIOPEN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define MIOPEN_VERSION_TWEAK +([0-9]+)" "\\1" + MIOPEN_TWEAK_VERSION "${MIOPEN_TWEAK_VERSION}") + + if(NOT MIOPEN_MAJOR_VERSION) + set(MIOPEN_VERSION "???") + else() + add_definitions("-DMIOPEN_MAJOR_VERSION=\"${MIOPEN_MAJOR_VERSION}\"") + math(EXPR MIOPEN_VERSION + "${MIOPEN_MAJOR_VERSION} * 1000 + + ${MIOPEN_MINOR_VERSION} * 10 + ${MIOPEN_PATCH_VERSION}") + message(STATUS "Current MIOpen header is ${MIOPEN_INCLUDE_DIR}/miopen/miopen.h " + "Current MIOpen version is v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}.${MIOPEN_PATCH_VERSION}. ") + endif() +endmacro() + +if(MIOPEN_FOUND) + find_miopen_version(${MIOPEN_INCLUDE_DIR}/miopen/version.h) +endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 7dac91e531e4cfd16fed211ef659350262dd3153..a200b948dea45dd0ee9e5ced5fbc38e1eb4349b7 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -44,6 +44,9 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + if (WITH_NV_JETSON) + list(REMOVE_ITEM cu_srcs "decode_jpeg_op.cu") + endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) @@ -180,8 +183,8 @@ function(op_library TARGET) list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc") list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc") list(REMOVE_ITEM hip_srcs "cholesky_op.cu") - list(REMOVE_ITEM hip_srcs "correlation_op.cu") list(REMOVE_ITEM hip_srcs "multinomial_op.cu") + list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu") hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() @@ -205,7 +208,7 @@ function(op_library TARGET) endif() # Define operators that don't need pybind here. 
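Editorial aside, not part of the patch: the `find_miopen_version` macro above, like the TensorRT version probe later in this patch, uses a two-step regex idiom to pull a number out of a `#define` in a header. A small, self-contained sketch of the idiom follows; `demo/version.h` and the `DEMO_*` names are invented for illustration.

```cmake
# Hypothetical illustration of the MATCH-then-REPLACE idiom.
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/demo/version.h" DEMO_VERSION_FILE_CONTENTS)

# Step 1: grab the whole "define DEMO_VERSION_MAJOR <n>" fragment.
string(REGEX MATCH "define DEMO_VERSION_MAJOR +([0-9]+)" DEMO_MAJOR_VERSION
       "${DEMO_VERSION_FILE_CONTENTS}")
# Step 2: reduce that fragment to the captured number only.
string(REGEX REPLACE "define DEMO_VERSION_MAJOR +([0-9]+)" "\\1"
       DEMO_MAJOR_VERSION "${DEMO_MAJOR_VERSION}")

if(NOT DEMO_MAJOR_VERSION)
  message(WARNING "Could not parse DEMO_VERSION_MAJOR from demo/version.h")
else()
  message(STATUS "Demo major version: v${DEMO_MAJOR_VERSION}")
endif()
```

`string(REGEX MATCH)` keeps the entire matched text, which is why the macro always follows it with `string(REGEX REPLACE ... "\\1" ...)` to retain just the capture group.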
- foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "nccl_op" + foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op" diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 889332fc55704f96f0afbd5815042ae8c0ba1035..e4b22befff8508f677288bba7b938556b796b68a 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -47,11 +47,23 @@ if(TENSORRT_FOUND) file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS) string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" TENSORRT_MINOR_VERSION + "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" TENSORRT_PATCH_VERSION + "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" TENSORRT_BUILD_VERSION + "${TENSORRT_VERSION_FILE_CONTENTS}") if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") file(READ ${TENSORRT_INCLUDE_DIR}/NvInferVersion.h TENSORRT_VERSION_FILE_CONTENTS) string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_MINOR +([0-9]+)" TENSORRT_MINOR_VERSION + "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_PATCH +([0-9]+)" TENSORRT_PATCH_VERSION + "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX MATCH "define NV_TENSORRT_BUILD +([0-9]+)" TENSORRT_BUILD_VERSION + "${TENSORRT_VERSION_FILE_CONTENTS}") endif() if("${TENSORRT_MAJOR_VERSION}" STREQUAL "") @@ -60,9 +72,15 @@ if(TENSORRT_FOUND) string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1" TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}") + string(REGEX REPLACE "define NV_TENSORRT_MINOR +([0-9]+)" "\\1" + TENSORRT_MINOR_VERSION "${TENSORRT_MINOR_VERSION}") + string(REGEX REPLACE "define NV_TENSORRT_PATCH +([0-9]+)" "\\1" + TENSORRT_PATCH_VERSION "${TENSORRT_PATCH_VERSION}") + string(REGEX REPLACE "define NV_TENSORRT_BUILD +([0-9]+)" "\\1" + TENSORRT_BUILD_VERSION "${TENSORRT_BUILD_VERSION}") message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " - "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") + "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}.${TENSORRT_MINOR_VERSION}.${TENSORRT_PATCH_VERSION}.${TENSORRT_BUILD_VERSION} ") include_directories(${TENSORRT_INCLUDE_DIR}) link_directories(${TENSORRT_LIBRARY}) add_definitions(-DPADDLE_WITH_TENSORRT) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index f90fa3509d63d4765ef78638b9f4f28d0e22bed2..aa31745c21340c4bef521f9cbf44535a634c4eb7 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -108,13 +108,19 @@ ENDMACRO() # 2. 
NAME: The name of the file, that determines the dirname
#
FUNCTION(file_download_and_uncompress URL NAME)
-  MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}")
+  set(options "")
+  set(oneValueArgs MD5)
+  set(multiValueArgs "")
+  cmake_parse_arguments(URL "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  MESSAGE(STATUS "Download dependence[${NAME}] from ${URL}, MD5: ${URL_MD5}")
  SET(${NAME}_INCLUDE_DIR ${THIRD_PARTY_PATH}/${NAME}/data PARENT_SCOPE)
  ExternalProject_Add(
-    extern_download_${NAME}
+    download_${NAME}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    PREFIX ${THIRD_PARTY_PATH}/${NAME}
    URL ${URL}
+   URL_MD5 ${URL_MD5}
+   TIMEOUT 120
    DOWNLOAD_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
    SOURCE_DIR ${THIRD_PARTY_PATH}/${NAME}/data/
    DOWNLOAD_NO_PROGRESS 1
@@ -123,7 +129,7 @@ FUNCTION(file_download_and_uncompress URL NAME)
    UPDATE_COMMAND ""
    INSTALL_COMMAND ""
  )
-  set(third_party_deps ${third_party_deps} extern_download_${NAME} PARENT_SCOPE)
+  set(third_party_deps ${third_party_deps} download_${NAME} PARENT_SCOPE)
ENDFUNCTION()
@@ -209,6 +215,8 @@ list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boos
list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool)
include(cblas) # find first, then download, build, install openblas
+
+message(STATUS "CBLAS_PROVIDER: ${CBLAS_PROVIDER}")
if(${CBLAS_PROVIDER} STREQUAL MKLML)
  list(APPEND third_party_deps extern_mklml)
elseif(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS)
@@ -242,8 +250,22 @@ if(WITH_GPU)
    include(external/cub) # download cub
    list(APPEND third_party_deps extern_cub)
  endif()
-  set(CUDAERROR_URL "http://paddlepaddledeps.bj.bcebos.com/cudaErrorMessage.tar.gz" CACHE STRING "" FORCE)
-  file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
+  set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE)
+  file_download_and_uncompress(${URL} "externalError" MD5 c0749523ebb536eb7382487d645d9cd4) # download file externalErrorMsg.tar.gz
+  if(WITH_TESTING)
+    # copy externalErrorMsg.pb so that unit tests can get the error message correctly.
+    set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data)
+    if(WIN32 AND (NOT "${CMAKE_GENERATOR}" STREQUAL "Ninja"))
+      set(DST_DIR1 ${CMAKE_BINARY_DIR}/paddle/fluid/third_party/externalError/data)
+    else()
+      set(DST_DIR1 ${CMAKE_BINARY_DIR}/paddle/third_party/externalError/data)
+    endif()
+    set(DST_DIR2 ${CMAKE_BINARY_DIR}/python/paddle/include/third_party/externalError/data)
+    add_custom_command(TARGET download_externalError POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR1}
+      COMMAND ${CMAKE_COMMAND} -E copy_directory ${SRC_DIR} ${DST_DIR2}
+      COMMENT "copy_directory from ${SRC_DIR} to ${DST_DIR1} and ${DST_DIR2}")
+  endif()
endif(WITH_GPU)
if(WITH_XPU)
@@ -261,6 +283,14 @@ if(WITH_PSLIB)
    if(WITH_PSLIB_BRPC)
        include(external/pslib_brpc) # download, build, install pslib_brpc
        list(APPEND third_party_deps extern_pslib_brpc)
+    else()
+        include(external/snappy)
+        list(APPEND third_party_deps extern_snappy)
+
+        include(external/leveldb)
+        list(APPEND third_party_deps extern_leveldb)
+        include(external/brpc)
+        list(APPEND third_party_deps extern_brpc)
    endif()
endif(WITH_PSLIB)
@@ -296,6 +326,11 @@ if (WITH_PSCORE)
    include(external/libmct) # download, build, install libmct
    list(APPEND third_party_deps extern_libmct)
+
+    if (WITH_HETERPS)
+        include(external/rocksdb) # download, build, install rocksdb
+        list(APPEND third_party_deps extern_rocksdb)
+    endif()
endif()
if(WITH_XBYAK)
diff --git a/cmake/thrust.cmake b/cmake/thrust.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..ff415b1e3c4bf6ff190b2f8e97cfb9da52259435
--- /dev/null
+++ b/cmake/thrust.cmake
@@ -0,0 +1,24 @@
+function(add_thrust_patches_if_necessary)
+  set(thrust_detect_file ${PROJECT_BINARY_DIR}/detect_thrust.cu)
+  file(WRITE ${thrust_detect_file} ""
+    "#include \"thrust/version.h\"\n"
+    "#include \"thrust/shuffle.h\"\n"
+    "#include \"stdio.h\"\n"
+    "int main() {\n"
+    "  int version = THRUST_VERSION;\n"
+    "  printf(\"%d\", version);\n"
+    "  return 0;\n"
+    "}\n")
+
+  execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}"
+    "--run" "${thrust_detect_file}"
+    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+    RESULT_VARIABLE nvcc_res ERROR_QUIET)
+  if(NOT nvcc_res EQUAL 0)
+    set(thrust_patches "${PADDLE_SOURCE_DIR}/patches/thrust")
+    message(STATUS "Add thrust patches: ${thrust_patches}")
+    include_directories(${thrust_patches})
+  endif()
+endfunction()
+
+add_thrust_patches_if_necessary()
diff --git a/go/README_cn.md b/go/README_cn.md
deleted file mode 100644
index 040540e939bc3a0993e7c963b281ad91fbfe1ffc..0000000000000000000000000000000000000000
--- a/go/README_cn.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Paddle 预测golang API
-
-## 安装
-首先cmake编译时打开`-DON_INFER=ON`,在编译目录下得到``paddle_inference_c_install_dir``,将该目录移动到当前目录中并重命名为`paddle_c`
-
-## 在Go中使用Paddle预测
-首先创建预测配置
-``` go
-config := paddle.NewAnalysisConfig()
-config.SetModel(model_file, params_file)
-config.SwitchUseFeedFetchOps(false)
-config.SwitchSpecifyInputNames(true)
-```
-
-创建predictor
-``` go
-predictor := paddle.NewPredictor(config)
-```
-
-获取输入Tensor和输出Tensor
-``` go
-inputs = predictor.GetInputTensors()
-```
-
-设置输入数据(假设只有一个输入)
-``` go
-input := inputs[0]
-input.SetValue(data)
-input.Reshape([]int32{1, 3, 300, 300})
-```
-
-运行预测
-``` go
-predictor.ZeroCopyRun()
-```
-
-获取输入Tensor的真实值
-``` go
-output := outputs[0]
-predictor.GetZeroCopyOutput(output)
-value := reflect.ValueOf(output.Value())
-shape, dtype := paddle.ShapeAndTypeOf(value)
-output_data := value.Interface().([][]float32)
-```
-
-## 示例
-源码见[mobilenet](./demo/mobilenet.go)
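Editorial aside, not part of the patch: the reworked `file_download_and_uncompress` helper in the third_party.cmake hunk above now accepts an optional `MD5` keyword via `cmake_parse_arguments` and renames its ExternalProject target to `download_<name>`. The usage sketch below assumes that helper is in scope; the URL, the `demo` name and the checksum are placeholders, not real artifacts.

```cmake
# Hypothetical call; mirrors the externalErrorMsg download in the hunk above.
set(DEMO_URL "https://example.com/demoData.tar.gz" CACHE STRING "" FORCE)

# cmake_parse_arguments() inside the helper picks the "MD5 <hash>" pair out of
# ${ARGN}, so the checksum reaches ExternalProject_Add as URL_MD5 and the
# archive is verified before being unpacked into ${THIRD_PARTY_PATH}/demo/data.
file_download_and_uncompress(${DEMO_URL} "demo"
                             MD5 00000000000000000000000000000000)  # placeholder hash

# The helper also appends download_demo to third_party_deps (PARENT_SCOPE), so
# anything that needs the unpacked data can depend on that target.
```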
-下载[数据](https://paddle-inference-dist.cdn.bcebos.com/mobilenet-test-model-data.tar.gz)并解压到当前目录 - -运行 -```bash -go mod init github.com/paddlepaddle -export LD_LIBRARY_PATH=`pwd`/paddle_c/paddle/lib:$LD_LIBRARY_PATH -go run ./demo/mobilenet.go -``` diff --git a/go/demo/mobilenet.go b/go/demo/mobilenet.go deleted file mode 100644 index c1ca2e967f72dc6646a6785d86ba59c709bfe25c..0000000000000000000000000000000000000000 --- a/go/demo/mobilenet.go +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -package main - -import "github.com/paddlepaddle/paddle" -import "strings" -import "io/ioutil" -import "strconv" -import "reflect" - -func main() { - config := paddle.NewAnalysisConfig() - config.SetModel("data/model/__model__", "data/model/__params__") - config.DisableGlogInfo() - config.SwitchUseFeedFetchOps(false) - config.SwitchSpecifyInputNames(true) - - predictor := paddle.NewPredictor(config) - - println("============== paddle inference ==============") - println("input num: ", predictor.GetInputNum()) - println("input name: ", predictor.GetInputNames()[0]) - println("output num: ", predictor.GetOutputNum()) - println("output name: ", predictor.GetInputNames()[0]) - println("============== run inference =================") - - input := predictor.GetInputTensors()[0] - output := predictor.GetOutputTensors()[0] - - filename := "data/data.txt" - data := ReadData(filename) - input.SetValue(data[:1 * 3 * 300 * 300]) - input.Reshape([]int32{1, 3, 300, 300}) - - predictor.SetZeroCopyInput(input) - predictor.ZeroCopyRun() - predictor.GetZeroCopyOutput(output) - - println("============= parse output ===================") - output_val := output.Value() - value := reflect.ValueOf(output_val) - shape, dtype := paddle.ShapeAndTypeOf(value) - switch dtype { - case paddle.PaddleDType(paddle.FLOAT32): - v := value.Interface().([][]float32) - println("v: ", v[0][0], v[0][1], "...") - case paddle.PaddleDType(paddle.UINT8): - v := value.Interface().([][]uint8) - println("v: ", v[0][0], v[0][1], "...") - case paddle.PaddleDType(paddle.INT32): - v := value.Interface().([][]int32) - println("v: ", v[0][0], v[0][1], "...") - case paddle.PaddleDType(paddle.INT64): - v := value.Interface().([][]int64) - println("v: ", v[0][0], v[0][1], "...") - } - println(shape[0], shape[1]) - println(output.Shape()[0]) -} - -func ReadData(filename string) []float32 { - file_bytes, _ := ioutil.ReadFile(filename) - data_slice := strings.Split(string(file_bytes), " ") - var result []float32 - for _, n := range data_slice { - r, _ := strconv.ParseFloat(n, 32) - result = append(result, float32(r)) - } - return result -} diff --git a/go/demo/mobilenet_c.cc b/go/demo/mobilenet_c.cc deleted file mode 100644 index 6a5cc683c9f9a9c88f73a3ca5ebac274210f3b7a..0000000000000000000000000000000000000000 --- a/go/demo/mobilenet_c.cc +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include -#include -#include - -void SetConfig(PD_AnalysisConfig *); -void ReadData(float *data, int size); - -int main(int argc, char *argv[]) { - PD_AnalysisConfig *config = PD_NewAnalysisConfig(); - SetConfig(config); - PD_Predictor *predictor = PD_NewPredictor(config); - - int input_num = PD_GetInputNum(predictor); - printf("Input num: %d\n", input_num); - int output_num = PD_GetOutputNum(predictor); - printf("Output num: %d\n", output_num); - - PD_ZeroCopyTensor input; - PD_InitZeroCopyTensor(&input); - input.name = const_cast(PD_GetInputName(predictor, 0)); // NOLINT - input.data.capacity = sizeof(float) * 1 * 3 * 300 * 300; - input.data.length = input.data.capacity; - input.data.data = malloc(input.data.capacity); - int shape[] = {1, 3, 300, 300}; - input.shape.data = static_cast(shape); - input.shape.capacity = sizeof(shape); - input.shape.length = sizeof(shape); - input.dtype = PD_FLOAT32; - ReadData((float *)input.data.data, 1 * 3 * 300 * 300); // NOLINT - float *data = (float *)input.data.data; // NOLINT - PD_SetZeroCopyInput(predictor, &input); - int *shape_ptr = (int *)input.shape.data; // NOLINT - - PD_ZeroCopyRun(predictor); - PD_ZeroCopyTensor output; - PD_InitZeroCopyTensor(&output); - output.name = const_cast(PD_GetOutputName(predictor, 0)); // NOLINT - PD_GetZeroCopyOutput(predictor, &output); - - PD_DestroyZeroCopyTensor(&output); - - PD_DeleteAnalysisConfig(config); - PD_DeletePredictor(predictor); - return 0; -} - -void SetConfig(PD_AnalysisConfig *config) { - PD_SetModel(config, "data/model/__model__", "data/model/__params__"); - PD_SwitchUseFeedFetchOps(config, false); - PD_SwitchSpecifyInputNames(config, true); - PD_DisableGlogInfo(config); - // PD_SwitchIrOptim(config, false); -} - -void ReadData(float *data, int n) { - FILE *fp = fopen("data/data.txt", "r"); - for (int i = 0; i < n; i++) { - fscanf(fp, "%f", &data[i]); - } - fclose(fp); -} diff --git a/go/demo/mobilenet_c_exp.cc b/go/demo/mobilenet_c_exp.cc deleted file mode 100644 index b4f42dab6790bfb6dd33860a8ada704166bb74ac..0000000000000000000000000000000000000000 --- a/go/demo/mobilenet_c_exp.cc +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#include -#include -#include - -void ReadData(float* data, int size); - -int main(int argc, char* argv[]) { - PD_Config* config = PD_ConfigCreate(); - PD_ConfigSetModel(config, "data/model/__model__", "data/model/__params__"); - PD_ConfigDisableGlogInfo(config); - - PD_Predictor* predictor = PD_PredictorCreate(config); - // config has destroyed in PD_PredictorCreate - config = NULL; - - int input_num = PD_PredictorGetInputNum(predictor); - printf("Input num: %d\n", input_num); - int output_num = PD_PredictorGetOutputNum(predictor); - printf("Output num: %d\n", output_num); - - PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); - PD_Tensor* input_tensor = - PD_PredictorGetInputHandle(predictor, input_names->data[0]); - PD_OneDimArrayCstrDestroy(input_names); - input_names = NULL; - - int32_t shape[] = {1, 3, 300, 300}; - float* data = (float*)malloc(sizeof(float) * 1 * 3 * 300 * 300); // NOLINT - ReadData(data, 1 * 3 * 300 * 300); // NOLINT - PD_TensorReshape(input_tensor, 4, shape); - PD_TensorCopyFromCpuFloat(input_tensor, data); - free(data); - data = NULL; - PD_PredictorRun(predictor); - - PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); - PD_Tensor* output_tensor = - PD_PredictorGetOutputHandle(predictor, output_names->data[0]); - PD_OneDimArrayCstrDestroy(output_names); - output_names = nullptr; - - PD_OneDimArrayInt32* out_shape = PD_TensorGetShape(output_tensor); - int32_t size = 1; - for (size_t index = 0; index < out_shape->size; ++index) { - size = size * out_shape->data[index]; - } - PD_OneDimArrayInt32Destroy(out_shape); - out_shape = NULL; - - data = (float*)malloc(sizeof(float) * size); // NOLINT - PD_TensorCopyToCpuFloat(output_tensor, data); - free(data); - data = NULL; - - PD_TensorDestroy(output_tensor); - output_tensor = NULL; - PD_TensorDestroy(input_tensor); - input_tensor = NULL; - PD_PredictorDestroy(predictor); - predictor = NULL; - - return 0; -} - -void ReadData(float* data, int n) { - FILE* fp = fopen("data/data.txt", "r"); - for (int i = 0; i < n; i++) { - fscanf(fp, "%f", &data[i]); - } - fclose(fp); -} diff --git a/go/demo/mobilenet_cxx.cc b/go/demo/mobilenet_cxx.cc deleted file mode 100644 index 7bdd6b2b03b24e2393e746edde754f763e9dd986..0000000000000000000000000000000000000000 --- a/go/demo/mobilenet_cxx.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-#include -#include -#include - -void SetConfig(paddle::AnalysisConfig *); - -int main(int argc, char *argv[]) { - paddle::AnalysisConfig config; - SetConfig(&config); - auto predictor = paddle::CreatePaddlePredictor(config); - auto input_name = predictor->GetInputNames()[0]; - auto input = predictor->GetInputTensor(input_name); - std::cout << predictor->GetOutputNames()[0] << std::endl; - std::vector shape{1, 3, 300, 300}; - input->Reshape(std::move(shape)); - std::vector data(1 * 300 * 300 * 3); - std::ifstream fin("data/data.txt"); - for (int i = 0; i < data.size(); i++) { - fin >> data[i]; - } - - input->copy_from_cpu(data.data()); - predictor->ZeroCopyRun(); - auto output_name = predictor->GetOutputNames()[0]; - auto output = predictor->GetOutputTensor(output_name); - return 0; -} - -void SetConfig(paddle::AnalysisConfig *config) { - config->SetModel("data/model/__model__", "data/model/__params__"); - config->SwitchUseFeedFetchOps(false); - config->SwitchSpecifyInputNames(true); - config->SwitchIrOptim(false); -} diff --git a/go/paddle/config.go b/go/paddle/config.go deleted file mode 100644 index 68a31230997bed73fbab1c1d1c7af123e353cf97..0000000000000000000000000000000000000000 --- a/go/paddle/config.go +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package paddle - -// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c -// #include -// #include -// #include -import "C" - -import "runtime" -import "unsafe" - -type Precision C.Precision - -const ( - Precision_FLOAT32 Precision = C.kFloat32 - Precision_INT8 Precision = C.kInt8 - Precision_HALF Precision = C.kHalf -) - -type AnalysisConfig struct { - c *C.PD_AnalysisConfig -} - -func NewAnalysisConfig() *AnalysisConfig { - c_config := C.PD_NewAnalysisConfig() - config := &AnalysisConfig{c: c_config} - runtime.SetFinalizer(config, (*AnalysisConfig).finalize) - return config -} - -func (config *AnalysisConfig) finalize() { - C.PD_DeleteAnalysisConfig(config.c) -} - -func (config *AnalysisConfig) SetModel(model, params string) { - //C.printString((*C.char)(unsafe.Pointer(&s[0]))) - c_model := C.CString(model) - defer C.free(unsafe.Pointer(c_model)) - var c_params *C.char - if params == "" { - c_params = nil - } else { - c_params = C.CString(params) - defer C.free(unsafe.Pointer(c_params)) - } - - C.PD_SetModel(config.c, c_model, c_params) -} - -func (config *AnalysisConfig) ModelDir() string { - return C.GoString(C.PD_ModelDir(config.c)) -} - -func (config *AnalysisConfig) ProgFile() string { - return C.GoString(C.PD_ProgFile(config.c)) -} - -func (config *AnalysisConfig) ParamsFile() string { - return C.GoString(C.PD_ParamsFile(config.c)) -} - -func (config *AnalysisConfig) EnableUseGpu(memory_pool_init_size_mb int, device_id int) { - C.PD_EnableUseGpu(config.c, C.int(memory_pool_init_size_mb), C.int(device_id)) -} - -func (config *AnalysisConfig) DisableGpu() { - C.PD_DisableGpu(config.c) -} - -func (config *AnalysisConfig) UseGpu() bool { - return ConvertCBooleanToGo(C.PD_UseGpu(config.c)) -} - -func (config *AnalysisConfig) GpuDeviceId() int { - return int(C.PD_GpuDeviceId(config.c)) -} - -func (config *AnalysisConfig) MemoryPoolInitSizeMb() int { - return int(C.PD_MemoryPoolInitSizeMb(config.c)) -} - -func (config *AnalysisConfig) FractionOfGpuMemoryForPool() float32 { - return float32(C.PD_FractionOfGpuMemoryForPool(config.c)) -} - -func (config *AnalysisConfig) EnableCudnn() { - C.PD_EnableCUDNN(config.c) -} - -func (config *AnalysisConfig) CudnnEnabled() bool { - return ConvertCBooleanToGo(C.PD_CudnnEnabled(config.c)) -} - -func (config *AnalysisConfig) SwitchIrOptim(x bool) { - C.PD_SwitchIrOptim(config.c, C.bool(x)) -} - -func (config *AnalysisConfig) IrOptim() bool { - return ConvertCBooleanToGo(C.PD_IrOptim(config.c)) -} - -func (config *AnalysisConfig) SwitchUseFeedFetchOps(x bool) { - C.PD_SwitchUseFeedFetchOps(config.c, C.bool(x)) -} - -func (config *AnalysisConfig) UseFeedFetchOpsEnabled() bool { - return ConvertCBooleanToGo(C.PD_UseFeedFetchOpsEnabled(config.c)) -} - -func (config *AnalysisConfig) SwitchSpecifyInputNames(x bool) { - C.PD_SwitchSpecifyInputNames(config.c, C.bool(x)) -} - -func (config *AnalysisConfig) SpecifyInputName() bool { - return ConvertCBooleanToGo(C.PD_SpecifyInputName(config.c)) -} - -func (config *AnalysisConfig) EnableTensorRtEngine(workspace_size int, max_batch_size int, min_subgraph_size int, precision Precision, use_static bool, use_calib_mode bool) { - C.PD_EnableTensorRtEngine(config.c, C.int(workspace_size), C.int(max_batch_size), C.int(min_subgraph_size), C.Precision(precision), C.bool(use_static), C.bool(use_calib_mode)) -} - -func (config *AnalysisConfig) TensorrtEngineEnabled() bool { - return ConvertCBooleanToGo(C.PD_TensorrtEngineEnabled(config.c)) -} - -func 
(config *AnalysisConfig) SwitchIrDebug(x bool) { - C.PD_SwitchIrDebug(config.c, C.bool(x)) -} - -func (config *AnalysisConfig) EnableMkldnn() { - C.PD_EnableMKLDNN(config.c) -} - -func (config *AnalysisConfig) MkldnnEnabled() bool { - return ConvertCBooleanToGo(C.PD_MkldnnEnabled(config.c)) -} - -func (config *AnalysisConfig) SetCpuMathLibraryNumThreads(n int) { - C.PD_SetCpuMathLibraryNumThreads(config.c, C.int(n)) -} - -func (config *AnalysisConfig) CpuMathLibraryNumThreads() int { - return int(C.PD_CpuMathLibraryNumThreads(config.c)) -} - -func (config *AnalysisConfig) EnableMkldnnQuantizer() { - C.PD_EnableMkldnnQuantizer(config.c) -} - -func (config *AnalysisConfig) EnableMkldnnBfloat16() { - C.PD_EnableMkldnnBfloat16(config.c) -} - -func (config *AnalysisConfig) MkldnnQuantizerEnabled() bool { - return ConvertCBooleanToGo(C.PD_MkldnnQuantizerEnabled(config.c)) -} - -func (config *AnalysisConfig) MkldnnBfloat16Enabled() bool { - return ConvertCBooleanToGo(C.PD_MkldnnBfloat16Enabled(config.c)) -} -// SetModelBuffer -// ModelFromMemory - -func (config *AnalysisConfig) EnableMemoryOptim() { - C.PD_EnableMemoryOptim(config.c) -} - -func (config *AnalysisConfig) MemoryOptimEnabled() bool { - return ConvertCBooleanToGo(C.PD_MemoryOptimEnabled(config.c)) -} - -func (config *AnalysisConfig) EnableProfile() { - C.PD_EnableProfile(config.c) -} - -func (config *AnalysisConfig) ProfileEnabled() bool { - return ConvertCBooleanToGo(C.PD_ProfileEnabled(config.c)) -} - -func (config *AnalysisConfig) DisableGlogInfo() { - C.PD_DisableGlogInfo(config.c) -} - -func (config *AnalysisConfig) DeletePass(pass string) { - c_pass := C.CString(pass) - defer C.free(unsafe.Pointer(c_pass)) - C.PD_DeletePass(config.c, c_pass) -} - -func (config *AnalysisConfig) SetInValid() { - C.PD_SetInValid(config.c) -} - -func (config *AnalysisConfig) IsValid() bool { - return ConvertCBooleanToGo(C.PD_IsValid(config.c)) -} diff --git a/go/paddle/predictor.go b/go/paddle/predictor.go deleted file mode 100644 index 5f2b2c81a60549dfdbf22dd31a98560e7e3a8cee..0000000000000000000000000000000000000000 --- a/go/paddle/predictor.go +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package paddle - -// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c -// #include -// #include "paddle_c_api.h" -import "C" - -import "reflect" -import "runtime" -import "unsafe" - -type Predictor struct { - c *C.PD_Predictor -} - -func NewPredictor(config *AnalysisConfig) *Predictor { - c_predictor := C.PD_NewPredictor((*config).c) - predictor := &Predictor{c: c_predictor} - runtime.SetFinalizer(predictor, (*Predictor).finalize) - return predictor -} - -func (predictor *Predictor) finalize() { - C.PD_DeletePredictor(predictor.c) -} - -func DeletePredictor(predictor *Predictor) { - C.PD_DeletePredictor(predictor.c) -} - -func (predictor *Predictor) GetInputNum() int { - return int(C.PD_GetInputNum(predictor.c)) -} - -func (predictor *Predictor) GetOutputNum() int { - return int(C.PD_GetOutputNum(predictor.c)) -} - -func (predictor *Predictor) GetInputName(n int) string { - return C.GoString(C.PD_GetInputName(predictor.c, C.int(n))) -} - -func (predictor *Predictor) GetOutputName(n int) string { - return C.GoString(C.PD_GetOutputName(predictor.c, C.int(n))) -} - -func (predictor *Predictor) GetInputTensors() [](*ZeroCopyTensor) { - var result [](*ZeroCopyTensor) - for i := 0; i < predictor.GetInputNum(); i++ { - tensor := NewZeroCopyTensor() - tensor.c.name = C.PD_GetInputName(predictor.c, C.int(i)) - result = append(result, tensor) - } - return result -} - -func (predictor *Predictor) GetOutputTensors() [](*ZeroCopyTensor) { - var result [](*ZeroCopyTensor) - for i := 0; i < predictor.GetOutputNum(); i++ { - tensor := NewZeroCopyTensor() - tensor.c.name = C.PD_GetOutputName(predictor.c, C.int(i)) - result = append(result, tensor) - } - return result -} - -func (predictor *Predictor) GetInputNames() []string { - names := make([]string, predictor.GetInputNum()) - for i := 0; i < len(names); i++ { - names[i] = predictor.GetInputName(i) - } - return names -} - -func (predictor *Predictor) GetOutputNames() []string { - names := make([]string, predictor.GetOutputNum()) - for i := 0; i < len(names); i++ { - names[i] = predictor.GetOutputName(i) - } - return names -} - -func (predictor *Predictor) SetZeroCopyInput(tensor *ZeroCopyTensor) { - C.PD_SetZeroCopyInput(predictor.c, tensor.c) -} - -func (predictor *Predictor) GetZeroCopyOutput(tensor *ZeroCopyTensor) { - C.PD_GetZeroCopyOutput(predictor.c, tensor.c) - tensor.name = C.GoString(tensor.c.name) - var shape []int32 - shape_hdr := (*reflect.SliceHeader)(unsafe.Pointer(&shape)) - shape_hdr.Data = uintptr(unsafe.Pointer(tensor.c.shape.data)) - shape_hdr.Len = int(tensor.c.shape.length / C.sizeof_int) - shape_hdr.Cap = int(tensor.c.shape.length / C.sizeof_int) - tensor.Reshape(shape) -} - -func (predictor *Predictor) ZeroCopyRun() { - C.PD_ZeroCopyRun(predictor.c) -} diff --git a/go/paddle/tensor.go b/go/paddle/tensor.go deleted file mode 100644 index 6fbcf039f88a7cc43a5d28f0433c9feb965566f0..0000000000000000000000000000000000000000 --- a/go/paddle/tensor.go +++ /dev/null @@ -1,255 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package paddle - -// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c -// #include -// #include -// #include -// #include -import "C" - -import "runtime" -import "reflect" -import "unsafe" -import ( - "bytes" - "encoding/binary" -) - -type PaddleDType C.PD_DataType - -const ( - FLOAT32 PaddleDType = C.PD_FLOAT32 - INT32 PaddleDType = C.PD_INT32 - INT64 PaddleDType = C.PD_INT64 - UINT8 PaddleDType = C.PD_UINT8 - UNKDTYPE PaddleDType = C.PD_UNKDTYPE -) - -var types = []struct { - gotype reflect.Type - dtype PaddleDType -}{ - {reflect.TypeOf(float32(0)), FLOAT32}, - {reflect.TypeOf(int32(0)), INT32}, - {reflect.TypeOf(int64(0)), INT64}, - {reflect.TypeOf(uint8(0)), UINT8}, -} - -func TypeOfShape(dtype PaddleDType, shape []int32) reflect.Type { - var ret reflect.Type - for _, t := range types { - if dtype == PaddleDType(t.dtype) { - ret = t.gotype - break - } - } - - if ret == nil { - panic(bug("Data %v type is not support", dtype)) - } - - for range shape { - ret = reflect.SliceOf(ret) - } - return ret -} - -type ZeroCopyTensor struct { - c *C.PD_ZeroCopyTensor - name string - shape []int32 -} - -func NewZeroCopyTensor() *ZeroCopyTensor { - c_tensor := C.PD_NewZeroCopyTensor() - - tensor := &ZeroCopyTensor{c: c_tensor} - runtime.SetFinalizer(tensor, (*ZeroCopyTensor).finalize) - return tensor -} - -func (tensor *ZeroCopyTensor) finalize() { - C.PD_DeleteZeroCopyTensor(tensor.c) -} - -func (tensor *ZeroCopyTensor) Shape() []int32 { - return tensor.shape -} - -func (tensor *ZeroCopyTensor) Name() string { - return C.GoString(tensor.c.name) -} - -func (tensor *ZeroCopyTensor) Rename(name string) { - tensor.name = name - tensor.c.name = (*C.char)(unsafe.Pointer(tensor.c.name)) - //tensor.c.name = C.CString(tensor.name) - //defer C.free(unsafe.Pointer(tensor.c.name)) -} - -func (tensor *ZeroCopyTensor) Reshape(shape []int32) { - tensor.shape = make([]int32, len(shape)) - copy(tensor.shape, shape) - length := C.sizeof_int * C.size_t(len(shape)) - if tensor.c.shape.capacity < C.size_t(length) { - if tensor.c.shape.capacity != C.size_t(0) { - C.free(tensor.c.shape.data) - } - tensor.c.shape.data = C.malloc(length) - tensor.c.shape.capacity = length - } - tensor.c.shape.length = length - C.memcpy(tensor.c.shape.data, unsafe.Pointer(&shape[0]), length) -} - -func (tensor *ZeroCopyTensor) DataType() PaddleDType { - return PaddleDType(tensor.c.dtype) -} - -func (tensor *ZeroCopyTensor) SetValue(value interface{}) { - val := reflect.ValueOf(value) - shape, dtype := ShapeAndTypeOf(val) - tensor.Reshape(shape) - num := numel(shape) - length := C.size_t(SizeofDataType(dtype) * num) - if tensor.c.data.capacity < length { - if tensor.c.data.capacity != C.size_t(0) { - C.free(tensor.c.data.data) - } - tensor.c.data.data = C.malloc(length) - tensor.c.data.capacity = length - } - tensor.c.data.length = length - - switch dtype { - case PaddleDType(UINT8): - data := val.Interface().([]uint8) - C.memcpy(tensor.c.data.data, unsafe.Pointer(&data[0]), length) - case PaddleDType(INT32): - data := val.Interface().([]int32) - 
C.memcpy(tensor.c.data.data, unsafe.Pointer(&data[0]), length) - case PaddleDType(INT64): - data := val.Interface().([]int64) - C.memcpy(tensor.c.data.data, unsafe.Pointer(&data[0]), length) - case PaddleDType(FLOAT32): - data := val.Interface().([]float32) - C.memcpy(tensor.c.data.data, unsafe.Pointer(&data[0]), length) - } - tensor.c.dtype = C.PD_DataType(dtype) -} - -func TypeOf(dtype PaddleDType, shape []int32) reflect.Type { - var ret reflect.Type - for _, t := range types { - if t.dtype == dtype { - ret = t.gotype - break - } - } - - for range shape { - ret = reflect.SliceOf(ret) - } - return ret -} - -func (tensor *ZeroCopyTensor) Value() interface{} { - t := TypeOf(PaddleDType(tensor.c.dtype), tensor.shape) - value := reflect.New(t) - c_bytes := tensor.c.data.data - length := tensor.c.data.length - var slice []byte - if unsafe.Sizeof(unsafe.Pointer(nil)) == 8 { - slice = (*[1<<50 - 1]byte)(unsafe.Pointer(c_bytes))[:length:length] - } else { - slice = (*[1 << 30]byte)(unsafe.Pointer(c_bytes))[:length:length] - } - r := bytes.NewReader(slice) - DecodeTensor(r, tensor.Shape(), t, value) - return reflect.Indirect(value).Interface() -} - -func Endian() binary.ByteOrder { - buf := [2]byte{} - *(*uint16)(unsafe.Pointer(&buf[0])) = uint16(0xABCD) - - var endian binary.ByteOrder - - switch buf { - case [2]byte{0xCD, 0xAB}: - endian = binary.LittleEndian - case [2]byte{0xAB, 0xCD}: - endian = binary.BigEndian - default: - panic("Could not determine native endianness.") - } - return endian -} - -func DecodeTensor(r *bytes.Reader, shape []int32, t reflect.Type, ptr reflect.Value) { - switch t.Kind() { - case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32: - binary.Read(r, Endian(), ptr.Interface()) - case reflect.Slice: - value := reflect.Indirect(ptr) - value.Set(reflect.MakeSlice(t, int(shape[0]), int(shape[0]))) - if len(shape) == 1 && value.Len() > 0 { - switch value.Index(0).Kind() { - case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32: - binary.Read(r, Endian(), value.Interface()) - return - } - } - - for i := 0; i < value.Len(); i++ { - DecodeTensor(r, shape[1:], t.Elem(), value.Index(i).Addr()) - } - } -} - -func SizeofDataType(dtype PaddleDType) int32 { - switch dtype { - case UINT8: - return int32(C.sizeof_uchar) - case INT32: - return int32(C.sizeof_int) - case INT64: - return int32(C.sizeof_longlong) - case FLOAT32: - return int32(C.sizeof_float) - } - return -1 -} - -func ShapeAndTypeOf(val reflect.Value) (shape []int32, dt PaddleDType) { - gotype := val.Type() - for gotype.Kind() == reflect.Array || gotype.Kind() == reflect.Slice { - shape = append(shape, int32(val.Len())) - if val.Len() > 0 { - val = val.Index(0) - } - gotype = gotype.Elem() - } - - for _, t := range types { - if gotype.Kind() == t.gotype.Kind() { - return shape, PaddleDType(t.dtype) - } - } - return shape, dt -} diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index a2062d82c8130bbde5e59e6bd0ca3515c38537b1..905347d031b35b39b43879c7bd78ab39e933a5b3 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -11,8 +11,8 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() -add_subdirectory(table) add_subdirectory(service) +add_subdirectory(table) add_subdirectory(test) add_subdirectory(index_dataset) diff --git a/paddle/fluid/distributed/common/sparse_sharding_merge.h b/paddle/fluid/distributed/common/sparse_sharding_merge.h new file mode 
100644 index 0000000000000000000000000000000000000000..3f84b5c4b212e2b261a4ef9b3f21163e5ef705b2 --- /dev/null +++ b/paddle/fluid/distributed/common/sparse_sharding_merge.h @@ -0,0 +1,311 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include + +#include +#include +#include +#include // NOLINT +#include + +#include +#include "boost/lexical_cast.hpp" +#include "glog/logging.h" +#include "paddle/fluid/distributed/common/utils.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/dim.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/string/split.h" + +constexpr int FG = 256 * 1024 * 1024; +constexpr int Q_SIZE = 10000; +constexpr int BUCKET = 10; +constexpr char XEOF[] = "EOF"; + +using boost::lexical_cast; + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +namespace paddle { +namespace distributed { + +class ShardingMerge { + public: + ShardingMerge() {} + ~ShardingMerge() {} + + void Merge(const std::vector &inputs, + const std::vector &feasigns, const std::string &output, + const int embedding_dim) { + pool_.reset(new ::ThreadPool(inputs.size())); + + std::vector> tasks(inputs.size()); + std::vector> rows; + rows.resize(inputs.size()); + + auto begin = GetCurrentUS(); + for (int x = 0; x < inputs.size(); ++x) { + tasks[x] = pool_->enqueue([this, x, &rows, &inputs, &feasigns]() -> int { + DeserializeRowsFromFile(inputs[x], feasigns[x], &rows[x]); + return 0; + }); + } + + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); + } + + int64_t total_rows = 0; + for (auto x = 0; x < rows.size(); x++) { + total_rows += rows[x].size(); + } + + auto end = GetCurrentUS(); + + VLOG(0) << "got " << total_rows + << " feasigin ids from sparse embedding using " << end - begin; + + std::vector total_dims = {total_rows, + static_cast(embedding_dim)}; + + std::vector> batch_buckets; + batch_buckets.resize(inputs.size()); + + for (int x = 0; x < rows.size(); ++x) { + batch_buckets[x] = bucket(rows[x].size(), BUCKET); + } + + std::ofstream out(output, std::ios::binary); + + begin = GetCurrentUS(); + SerializeRowsToStream(out, rows, batch_buckets, total_rows); + end = GetCurrentUS(); + VLOG(0) << "write rows to oostrream using " << end - begin; + + begin = GetCurrentUS(); + SerializePreTensorToStream(out, total_dims); + end = GetCurrentUS(); + VLOG(0) << "write pretensor to oostrream using " << end - begin; + + begin = GetCurrentUS(); + SerializeValueToStream(out, inputs, batch_buckets, embedding_dim); + end = GetCurrentUS(); + VLOG(0) << "write values to oostrream using " << end - begin; + } + + private: + void SerializeRowsToStream(std::ostream &os, + const std::vector> &rows, + const std::vector> &batch_buckets, + int64_t total_rows) { + { // the 1st field, uint32_t 
version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + + { + // the 2st field, rows information + os.write(reinterpret_cast(&total_rows), sizeof(total_rows)); + + for (int b = 0; b < BUCKET; ++b) { + for (int x = 0; x < batch_buckets.size(); ++x) { + auto begin = batch_buckets[x][b]; + auto end = batch_buckets[x][b + 1]; + + if (end - begin == 0) continue; + + os.write(reinterpret_cast(rows[x].data() + begin), + sizeof(int64_t) * (end - begin)); + } + } + + // the 3st field, the height of SelectedRows + int64_t height = total_rows; + os.write(reinterpret_cast(&height), sizeof(height)); + } + } + + void SerializePreTensorToStream(std::ostream &os, + const std::vector &dims) { + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + framework::proto::VarType::TensorDesc desc; + desc.set_data_type(framework::proto::VarType::FP32); + auto *pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + } + + void SerializeValueToVec(std::ifstream &in, const int batch, + const int embedding_dim, std::vector *out) { + auto queue = + std::make_shared>>(); + + auto read = [batch, &in, &queue]() { + std::string line; + std::vector columns; + std::vector values_str; + + int count = 0; + + while (std::getline(in, line)) { + ++count; + columns = string::Split(line, '\t'); + + if (columns.size() != 5) { + VLOG(0) << "unexpected line: " << line << ", skip it"; + continue; + } + + values_str = string::Split(columns[4], ','); + queue->Push(values_str); + + if (count >= batch) { + break; + } + } + queue->Push({}); + }; + + auto write = [embedding_dim, &out, &queue]() { + std::vector values_str; + std::string line; + + while (true) { + queue->Pop(&values_str); + + if (values_str.size() == 0) { + break; + } + + for (int x = 0; x < embedding_dim; ++x) { + float v = 0.0; + try { + v = lexical_cast(values_str[x]); + } catch (boost::bad_lexical_cast &e) { + VLOG(0) << " get unexpected line: " << line; + } + out->push_back(v); + } + } + }; + + std::thread p_read(read); + std::thread p_write(write); + p_read.join(); + p_write.join(); + } + + void SerializeVecToStream(std::ostream &out, + const std::vector &value) { + out.write(reinterpret_cast(value.data()), + static_cast(sizeof(float) * value.size())); + } + + void SerializeValueToStream( + std::ostream &out, const std::vector &ins, + const std::vector> &batch_buckets, + const int embedding_dim) { + std::vector> in_streams; + + for (int x = 0; x < ins.size(); ++x) { + in_streams.emplace_back(std::make_shared(ins[x])); + } + + std::vector> tasks(ins.size()); + + for (int b = 0; b < BUCKET; ++b) { + std::vector> values; + values.resize(tasks.size()); + + auto begin = GetCurrentUS(); + + for (int x = 0; x < tasks.size(); ++x) { + auto batch = batch_buckets[x][b + 1] - batch_buckets[x][b]; + values[x].clear(); + values[x].reserve(batch * embedding_dim); + } + + for (int x = 0; x < tasks.size(); ++x) { + tasks[x] = + pool_->enqueue([this, b, x, &out, &in_streams, &batch_buckets, + &values, embedding_dim]() -> int { + auto batch = batch_buckets[x][b + 1] - batch_buckets[x][b]; + if (batch == 0) return 0; + SerializeValueToVec(*(in_streams[x].get()), batch, 
embedding_dim, + &values[x]); + return 0; + }); + } + + for (size_t x = 0; x < tasks.size(); ++x) { + tasks[x].wait(); + } + + auto end = GetCurrentUS(); + + auto begin1 = GetCurrentUS(); + for (size_t x = 0; x < tasks.size(); ++x) { + SerializeVecToStream(out, values[x]); + } + auto end1 = GetCurrentUS(); + + VLOG(0) << "serialize buckets " << b << " read using " << end - begin + << ", to oostream using " << end1 - begin1; + } + } + + void DeserializeRowsFromFile(const std::string &input_file, + const int64_t feasigns, + std::vector *rows) { + std::string line; + std::vector columns; + std::ifstream file(input_file); + + rows->reserve(feasigns); + + while (std::getline(file, line)) { + columns = string::Split(line, '\t'); + if (columns.size() != 5) { + VLOG(0) << "unexpected line: " << line << ", skip it"; + continue; + } + rows->push_back(std::stoull(columns[0])); + } + + VLOG(0) << "parse " << rows->size() << " embedding rows from " + << input_file; + } + + private: + std::unique_ptr<::ThreadPool> pool_; +}; +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/common/utils.h b/paddle/fluid/distributed/common/utils.h index f81f84b1e117510443a5698a6ba1574262f640a5..2305001ad6f8f90eea49efa88b2a2615176f3ffb 100644 --- a/paddle/fluid/distributed/common/utils.h +++ b/paddle/fluid/distributed/common/utils.h @@ -14,6 +14,8 @@ #pragma once +#include + #include #include #include @@ -83,5 +85,11 @@ std::string to_string(const std::vector& vec) { } return ss.str(); } + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; } -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc index dfd55f16e1a065e46b2186a6a589eabc1ac3b431..9e2a0b35224a4ea3a6198e20309d3a335999651e 100644 --- a/paddle/fluid/distributed/fleet.cc +++ b/paddle/fluid/distributed/fleet.cc @@ -417,8 +417,10 @@ void FleetWrapper::PushSparseFromTensorWithLabelAsync( return; } -void FleetWrapper::LoadModel(const std::string& path, const int mode) { - auto ret = pserver_ptr_->_worker_ptr->load(path, std::to_string(mode)); +void FleetWrapper::LoadModel(const std::string& path, const std::string& mode) { + auto* communicator = Communicator::GetInstance(); + auto ret = communicator->_worker_ptr->load(path, mode); + // auto ret = pserver_ptr_->_worker_ptr->load(path, std::to_string(mode)); ret.wait(); if (ret.get() != 0) { LOG(ERROR) << "load model from path:" << path << " failed"; @@ -429,8 +431,11 @@ void FleetWrapper::LoadModel(const std::string& path, const int mode) { void FleetWrapper::LoadModelOneTable(const uint64_t table_id, const std::string& path, const int mode) { + auto* communicator = Communicator::GetInstance(); auto ret = - pserver_ptr_->_worker_ptr->load(table_id, path, std::to_string(mode)); + communicator->_worker_ptr->load(table_id, path, std::to_string(mode)); + // auto ret = + // pserver_ptr_->_worker_ptr->load(table_id, path, std::to_string(mode)); ret.wait(); if (ret.get() != 0) { LOG(ERROR) << "load model of table id: " << table_id diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index 0da5d1e2bf987f38de3b9a03c659fc5e1841eca1..1b2bde85de04c2f0dc528700f10d087199c56c50 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -200,7 +200,7 @@ class FleetWrapper { void PrintTableStat(const uint64_t table_id); // mode = 0, load all feature // mode = 1, load delta feature, 
which means load diff - void LoadModel(const std::string& path, const int mode); + void LoadModel(const std::string& path, const std::string& mode); // mode = 0, load all feature // mode = 1, load delta feature, which means load diff void LoadModelOneTable(const uint64_t table_id, const std::string& path, diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.cc b/paddle/fluid/distributed/index_dataset/index_sampler.cc index 58f85d98fb09c6576daa0816be2d58c90c5a8a42..3e573bbdd2de97130a109ddb583a724cf363c6be 100644 --- a/paddle/fluid/distributed/index_dataset/index_sampler.cc +++ b/paddle/fluid/distributed/index_dataset/index_sampler.cc @@ -13,13 +13,10 @@ // limitations under the License. #include "paddle/fluid/distributed/index_dataset/index_sampler.h" -#include "paddle/fluid/operators/math/sampler.h" namespace paddle { namespace distributed { -using Sampler = paddle::operators::math::Sampler; - std::vector> LayerWiseSampler::sample( const std::vector>& user_inputs, const std::vector& target_ids, bool with_hierarchy) { @@ -30,22 +27,7 @@ std::vector> LayerWiseSampler::sample( std::vector(user_feature_num + 2)); auto max_layer = tree_->Height(); - std::vector sampler_vec(max_layer - start_sample_layer_); - std::vector> layer_ids(max_layer - - start_sample_layer_); - - auto layer_index = max_layer - 1; size_t idx = 0; - while (layer_index >= start_sample_layer_) { - auto layer_codes = tree_->GetLayerCodes(layer_index); - layer_ids[idx] = tree_->GetNodes(layer_codes); - sampler_vec[idx] = new paddle::operators::math::UniformSampler( - layer_ids[idx].size() - 1, seed_); - layer_index--; - idx++; - } - - idx = 0; for (size_t i = 0; i < input_num; i++) { auto travel_codes = tree_->GetTravelCodes(target_ids[i], start_sample_layer_); @@ -76,18 +58,15 @@ std::vector> LayerWiseSampler::sample( for (int idx_offset = 0; idx_offset < layer_counts_[j]; idx_offset++) { int sample_res = 0; do { - sample_res = sampler_vec[j]->Sample(); - } while (layer_ids[j][sample_res].id() == travel_path[j].id()); + sample_res = sampler_vec_[j]->Sample(); + } while (layer_ids_[j][sample_res].id() == travel_path[j].id()); outputs[idx + idx_offset][user_feature_num] = - layer_ids[j][sample_res].id(); + layer_ids_[j][sample_res].id(); outputs[idx + idx_offset][user_feature_num + 1] = 0; } idx += layer_counts_[j]; } } - for (size_t i = 0; i < sampler_vec.size(); i++) { - delete sampler_vec[i]; - } return outputs; } diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.h b/paddle/fluid/distributed/index_dataset/index_sampler.h index 66882bedc9b76593b9b28f184fc26ff4897494e6..8813421446a21c1379ca872952fe8b367d0724ca 100644 --- a/paddle/fluid/distributed/index_dataset/index_sampler.h +++ b/paddle/fluid/distributed/index_dataset/index_sampler.h @@ -16,6 +16,7 @@ #include #include "paddle/fluid/distributed/index_dataset/index_wrapper.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/sampler.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -83,6 +84,23 @@ class LayerWiseSampler : public IndexSampler { } reverse(layer_counts_.begin(), layer_counts_.end()); VLOG(3) << "sample counts sum: " << layer_counts_sum_; + + auto max_layer = tree_->Height(); + sampler_vec_.clear(); + layer_ids_.clear(); + + auto layer_index = max_layer - 1; + size_t idx = 0; + while (layer_index >= start_sample_layer_) { + auto layer_codes = tree_->GetLayerCodes(layer_index); + layer_ids_.push_back(tree_->GetNodes(layer_codes)); + auto sampler_temp = + std::make_shared( + 
layer_ids_[idx].size() - 1, seed_); + sampler_vec_.push_back(sampler_temp); + layer_index--; + idx++; + } } std::vector> sample( const std::vector>& user_inputs, @@ -94,6 +112,8 @@ class LayerWiseSampler : public IndexSampler { std::shared_ptr tree_{nullptr}; int seed_{0}; int start_sample_layer_{1}; + std::vector> sampler_vec_; + std::vector> layer_ids_; }; } // end namespace distributed diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc index a9370561a540bea3416508b45d8cbf8cb997ed33..a1440260bf2e77093bb937e62b13b54ad06a3e64 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/service/brpc_ps_server.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include // NOLINT +#include "butil/object_pool.h" #include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/table.h" #include "paddle/fluid/framework/archive.h" @@ -196,12 +197,13 @@ int32_t BrpcPsService::pull_dense(Table *table, const PsRequestMessage &request, return 0; } - std::vector res_data; - res_data.resize(num * table->value_accesor()->select_size() / sizeof(float)); - table->pull_dense(res_data.data(), num); + auto res_data = butil::get_object>(); + res_data->resize(num * table->value_accesor()->select_size() / sizeof(float)); + table->pull_dense(res_data->data(), num); - cntl->response_attachment().append((char *)res_data.data(), - res_data.size() * sizeof(float)); + cntl->response_attachment().append((char *)(res_data->data()), + res_data->size() * sizeof(float)); + butil::return_object(res_data); return 0; } @@ -367,12 +369,13 @@ int32_t BrpcPsService::pull_sparse(Table *table, value.DeserializeFromBytes(const_cast(data)); - std::vector res_data; - res_data.resize(num * dim); - table->pull_sparse(res_data.data(), value); + auto res_data = butil::get_object>(); + res_data->resize(num * dim); + table->pull_sparse(res_data->data(), value); - cntl->response_attachment().append((char *)res_data.data(), - res_data.size() * sizeof(float)); + cntl->response_attachment().append((char *)(res_data->data()), + res_data->size() * sizeof(float)); + butil::return_object(res_data); return 0; } diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index eafb4d596cc1671db26189b84ea9d0c0c31ea398..70f2da6d7252cee0268bdd35999926a232bc5b34 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -80,11 +80,11 @@ std::future GraphBrpcClient::get_node_feat( [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; - int fail_num = 0; + size_t fail_num = 0; for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { - if (closure->check_response(request_idx, - PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + if (closure->check_response(request_idx, PS_GRAPH_GET_NODE_FEAT) != + 0) { ++fail_num; } else { auto &res_io_buffer = @@ -144,6 +144,163 @@ std::future GraphBrpcClient::get_node_feat( return fut; } + +std::future GraphBrpcClient::clear_nodes(uint32_t table_id) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + server_size, [&, server_size = this->server_size ](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < server_size; ++request_idx) { + if 
(closure->check_response(request_idx, PS_GRAPH_CLEAR) != 0) { + ++fail_num; + break; + } + } + ret = fail_num == 0 ? 0 : -1; + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + for (size_t i = 0; i < server_size; i++) { + int server_index = i; + closure->request(server_index)->set_cmd_id(PS_GRAPH_CLEAR); + closure->request(server_index)->set_table_id(table_id); + closure->request(server_index)->set_client_id(_client_id); + + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(server_index)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(server_index), + closure->request(server_index), + closure->response(server_index), closure); + } + return fut; +} +std::future GraphBrpcClient::add_graph_node( + uint32_t table_id, std::vector &node_id_list, + std::vector &is_weighted_list) { + std::vector> request_bucket; + std::vector> is_weighted_bucket; + bool add_weight = is_weighted_list.size() > 0; + std::vector server_index_arr; + std::vector index_mapping(server_size, -1); + for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_id_list[query_idx]); + if (index_mapping[server_index] == -1) { + index_mapping[server_index] = request_bucket.size(); + server_index_arr.push_back(server_index); + request_bucket.push_back(std::vector()); + if (add_weight) is_weighted_bucket.push_back(std::vector()); + } + request_bucket[index_mapping[server_index]].push_back( + node_id_list[query_idx]); + if (add_weight) + is_weighted_bucket[index_mapping[server_index]].push_back( + query_idx < is_weighted_list.size() ? is_weighted_list[query_idx] + : false); + } + size_t request_call_num = request_bucket.size(); + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, [&, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, PS_GRAPH_ADD_GRAPH_NODE) != + 0) { + ++fail_num; + } + } + ret = fail_num == request_call_num ? 
-1 : 0; + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = server_index_arr[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_ADD_GRAPH_NODE); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = request_bucket[request_idx].size(); + closure->request(request_idx) + ->add_params((char *)request_bucket[request_idx].data(), + sizeof(uint64_t) * node_num); + if (add_weight) { + bool weighted[is_weighted_bucket[request_idx].size() + 1]; + for (size_t j = 0; j < is_weighted_bucket[request_idx].size(); j++) + weighted[j] = is_weighted_bucket[request_idx][j]; + closure->request(request_idx) + ->add_params((char *)weighted, + sizeof(bool) * is_weighted_bucket[request_idx].size()); + } + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + return fut; +} +std::future GraphBrpcClient::remove_graph_node( + uint32_t table_id, std::vector &node_id_list) { + std::vector> request_bucket; + std::vector server_index_arr; + std::vector index_mapping(server_size, -1); + for (size_t query_idx = 0; query_idx < node_id_list.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_id_list[query_idx]); + if (index_mapping[server_index] == -1) { + index_mapping[server_index] = request_bucket.size(); + server_index_arr.push_back(server_index); + request_bucket.push_back(std::vector()); + } + request_bucket[index_mapping[server_index]].push_back( + node_id_list[query_idx]); + } + size_t request_call_num = request_bucket.size(); + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, [&, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (size_t request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_REMOVE_GRAPH_NODE) != 0) { + ++fail_num; + } + } + ret = fail_num == request_call_num ? 
-1 : 0; + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = server_index_arr[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_REMOVE_GRAPH_NODE); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = request_bucket[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)request_bucket[request_idx].data(), + sizeof(uint64_t) * node_num); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + return fut; +} // char* &buffer,int &actual_size std::future GraphBrpcClient::batch_sample_neighboors( uint32_t table_id, std::vector node_ids, int sample_size, @@ -174,8 +331,8 @@ std::future GraphBrpcClient::batch_sample_neighboors( [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { int ret = 0; auto *closure = (DownpourBrpcClosure *)done; - int fail_num = 0; - for (int request_idx = 0; request_idx < request_call_num; + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < request_call_num; ++request_idx) { if (closure->check_response(request_idx, PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { @@ -254,13 +411,14 @@ std::future GraphBrpcClient::random_sample_nodes( auto &res_io_buffer = closure->cntl(0)->response_attachment(); butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); size_t bytes_size = io_buffer_itr.bytes_left(); - char buffer[bytes_size]; + char *buffer = new char[bytes_size]; auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); int index = 0; while (index < bytes_size) { ids.push_back(*(uint64_t *)(buffer + index)); index += GraphNode::id_size; } + delete[] buffer; } closure->set_promise_value(ret); }); @@ -292,7 +450,7 @@ std::future GraphBrpcClient::pull_graph_list( auto &res_io_buffer = closure->cntl(0)->response_attachment(); butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); size_t bytes_size = io_buffer_itr.bytes_left(); - char buffer[bytes_size]; + char *buffer = new char[bytes_size]; io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); int index = 0; while (index < bytes_size) { @@ -301,6 +459,7 @@ std::future GraphBrpcClient::pull_graph_list( index += node.get_size(false); res.push_back(node); } + delete buffer; } closure->set_promise_value(ret); }); diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h index 4e6775a4bedaf1a4028fe483f58be818ef1e3581..5696e8b08037b7027939f472f58ec79925143e4f 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -78,6 +78,13 @@ class GraphBrpcClient : public BrpcPsClient { const uint32_t& table_id, const std::vector& node_ids, const std::vector& feature_names, std::vector>& res); + + virtual std::future clear_nodes(uint32_t table_id); + virtual std::future add_graph_node( + uint32_t table_id, std::vector& node_id_list, + std::vector& is_weighted_list); + virtual std::future remove_graph_node( + uint32_t table_id, std::vector& node_id_list); virtual 
int32_t initialize(); int get_shard_num() { return shard_num; } void set_shard_num(int shard_num) { this->shard_num = shard_num; } diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc index bdd926278b624b9e9bfdf19a4f293784bef6e28f..52ac8c5d688a4ada72212923bdd478b788e422ee 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -24,6 +24,14 @@ namespace paddle { namespace distributed { +#define CHECK_TABLE_EXIST(table, request, response) \ + if (table == NULL) { \ + std::string err_msg("table not found with table_id:"); \ + err_msg.append(std::to_string(request.table_id())); \ + set_response_code(response, -1, err_msg.c_str()); \ + return -1; \ + } + int32_t GraphBrpcServer::initialize() { auto &service_config = _config.downpour_server_param().service_param(); if (!service_config.has_service_class()) { @@ -71,6 +79,58 @@ uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { return 0; } +int32_t GraphBrpcService::clear_nodes(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + ((GraphTable *)table)->clear_nodes(); + return 0; +} + +int32_t GraphBrpcService::add_graph_node(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 1) { + set_response_code( + response, -1, + "graph_get_node_feat request requires at least 2 arguments"); + return 0; + } + + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); + std::vector is_weighted_list; + if (request.params_size() == 2) { + size_t weight_list_size = request.params(1).size() / sizeof(bool); + bool *is_weighted_buffer = (bool *)(request.params(1).c_str()); + is_weighted_list = std::vector(is_weighted_buffer, + is_weighted_buffer + weight_list_size); + } + + ((GraphTable *)table)->add_graph_node(node_ids, is_weighted_list); + return 0; +} +int32_t GraphBrpcService::remove_graph_node(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 1) { + set_response_code( + response, -1, + "graph_get_node_feat request requires at least 1 argument"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); + + ((GraphTable *)table)->remove_graph_node(node_ids); + return 0; +} int32_t GraphBrpcServer::port() { return _server.listen_address().port; } int32_t GraphBrpcService::initialize() { @@ -92,21 +152,17 @@ int32_t GraphBrpcService::initialize() { &GraphBrpcService::graph_random_sample_nodes; _service_handler_map[PS_GRAPH_GET_NODE_FEAT] = &GraphBrpcService::graph_get_node_feat; - + _service_handler_map[PS_GRAPH_CLEAR] = &GraphBrpcService::clear_nodes; + _service_handler_map[PS_GRAPH_ADD_GRAPH_NODE] = + &GraphBrpcService::add_graph_node; + _service_handler_map[PS_GRAPH_REMOVE_GRAPH_NODE] = + &GraphBrpcService::remove_graph_node; // shard初始化,server启动后才可从env获取到server_list的shard信息 initialize_shard_info(); return 0; } -#define CHECK_TABLE_EXIST(table, request, response) \ - if (table == NULL) { \ - std::string 
err_msg("table not found with table_id:"); \ - err_msg.append(std::to_string(request.table_id())); \ - set_response_code(response, -1, err_msg.c_str()); \ - return -1; \ - } - int32_t GraphBrpcService::initialize_shard_info() { if (!_is_initialize_shard_info) { std::lock_guard guard(_initialize_shard_mutex); diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h index 32c572f9e6c2bf759c59190679bcf7570a807f2d..47c370572826ac2807e4ea5cb36cf3a667dfed10 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -86,6 +86,13 @@ class GraphBrpcService : public PsBaseService { int32_t graph_get_node_feat(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl); + int32_t clear_nodes(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t add_graph_node(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t remove_graph_node(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); int32_t barrier(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl); int32_t load_one_table(Table *table, const PsRequestMessage &request, diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/service/graph_py_service.cc index 61e4e0cf7bb9155d25c630296c2b55a7d3400bfc..39befb1a112c854a183903d76a71d9e6c920b215 100644 --- a/paddle/fluid/distributed/service/graph_py_service.cc +++ b/paddle/fluid/distributed/service/graph_py_service.cc @@ -44,6 +44,9 @@ void GraphPyService::add_table_feat_conf(std::string table_name, } } +void add_graph_node(std::vector node_ids, + std::vector weight_list) {} +void remove_graph_node(std::vector node_ids) {} void GraphPyService::set_up(std::string ips_str, int shard_num, std::vector node_types, std::vector edge_types) { @@ -247,6 +250,34 @@ void GraphPyClient::load_edge_file(std::string name, std::string filepath, } } +void GraphPyClient::clear_nodes(std::string name) { + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = get_ps_client()->clear_nodes(table_id); + status.wait(); + } +} + +void GraphPyClient::add_graph_node(std::string name, + std::vector& node_ids, + std::vector& weight_list) { + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + get_ps_client()->add_graph_node(table_id, node_ids, weight_list); + status.wait(); + } +} + +void GraphPyClient::remove_graph_node(std::string name, + std::vector& node_ids) { + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = get_ps_client()->remove_graph_node(table_id, node_ids); + status.wait(); + } +} + void GraphPyClient::load_node_file(std::string name, std::string filepath) { // 'n' means load nodes and 'node_type' follows std::string params = "n" + name; diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h index c6657be96ba446d2f7538943aab43dd47e1868fb..da027fbae3e6f0ca1e902795b0640cee1e0b76cc 100644 --- a/paddle/fluid/distributed/service/graph_py_service.h +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -141,6 +141,10 @@ class GraphPyClient : public GraphPyService { void finalize_worker(); 
void load_edge_file(std::string name, std::string filepath, bool reverse); void load_node_file(std::string name, std::string filepath); + void clear_nodes(std::string name); + void add_graph_node(std::string name, std::vector& node_ids, + std::vector& weight_list); + void remove_graph_node(std::string name, std::vector& node_ids); int get_client_id() { return client_id; } void set_client_id(int client_id) { this->client_id = client_id; } void start_client(); diff --git a/paddle/fluid/distributed/service/ps_local_client.cc b/paddle/fluid/distributed/service/ps_local_client.cc index 2acc845a50890beb834676c3394f8dabc2a77e78..e949b21b02e6d9842ffae377a17610757a65ae75 100644 --- a/paddle/fluid/distributed/service/ps_local_client.cc +++ b/paddle/fluid/distributed/service/ps_local_client.cc @@ -42,17 +42,17 @@ int32_t PsLocalClient::initialize() { ::std::future PsLocalClient::load(const std::string& epoch, const std::string& mode) { // TODO - // for (auto& it : _table_map) { - // load(it.first, epoch, mode); - //} + for (auto& it : _table_map) { + load(it.first, epoch, mode); + } return done(); } ::std::future PsLocalClient::load(uint32_t table_id, const std::string& epoch, const std::string& mode) { // TODO - // auto* table_ptr = table(table_id); - // table_ptr->load(epoch, mode); + auto* table_ptr = table(table_id); + table_ptr->load(epoch, mode); return done(); } @@ -245,7 +245,6 @@ int32_t PsLocalClient::initialize() { ::std::future PsLocalClient::push_sparse_raw_gradient( size_t table_id, const uint64_t* keys, const float** update_values, size_t num, void* callback) { - VLOG(1) << "wxx push_sparse_raw_gradient"; PSClientClosure* closure = reinterpret_cast(callback); auto* accessor = table_accessor(table_id); auto* table_ptr = table(table_id); diff --git a/paddle/fluid/distributed/service/ps_local_server.h b/paddle/fluid/distributed/service/ps_local_server.h index dfbccc70900e3cf10fbb0852a114e400d738e2d6..33b0b5fa796d7571e16a0f79fc6ce4de21b1e7a8 100644 --- a/paddle/fluid/distributed/service/ps_local_server.h +++ b/paddle/fluid/distributed/service/ps_local_server.h @@ -26,9 +26,14 @@ class PsLocalServer : public PSServer { PsLocalServer() {} virtual ~PsLocalServer() {} virtual uint64_t start() { return 0; } - virtual uint64_t start(const std::string& ip, uint32_t port) { return 0; } + virtual uint64_t start(const std::string &ip, uint32_t port) { return 0; } virtual int32_t stop() { return 0; } virtual int32_t port() { return 0; } + virtual int32_t configure( + const PSParameter &config, PSEnvironment &env, size_t server_rank, + const std::vector &server_sub_program = {}) { + return 0; + } private: virtual int32_t initialize() { return 0; } diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index d908c26da9870a93d81c0242ac03e26cfebdb976..a4b811e950a3b56443261ceac37fa658007d519d 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -52,6 +52,9 @@ enum PsCmdID { PS_GRAPH_SAMPLE_NEIGHBOORS = 31; PS_GRAPH_SAMPLE_NODES = 32; PS_GRAPH_GET_NODE_FEAT = 33; + PS_GRAPH_CLEAR = 34; + PS_GRAPH_ADD_GRAPH_NODE = 35; + PS_GRAPH_REMOVE_GRAPH_NODE = 36; } message PsRequestMessage { diff --git a/paddle/fluid/distributed/service/server.h b/paddle/fluid/distributed/service/server.h index 74a8cbe44b144b75f33a9c392ffdc80148a82011..89b089386f501835b7c384477b84f98f94c2a4a9 100644 --- a/paddle/fluid/distributed/service/server.h +++ b/paddle/fluid/distributed/service/server.h @@ -70,7 +70,7 @@ 
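Two related pieces land here: sendrecv.proto reserves the new command ids (PS_GRAPH_CLEAR = 34, PS_GRAPH_ADD_GRAPH_NODE = 35, PS_GRAPH_REMOVE_GRAPH_NODE = 36), and the server.h hunk below drops `final` from PSServer::configure so that the no-op override added to PsLocalServer above becomes legal. A stripped-down sketch of that virtual-dispatch change follows; the class names are invented for illustration and are not the real Paddle headers.

#include <cstdint>

// Sketch only: shows why `final` had to be removed from configure().
struct BasePsServer {
  virtual int32_t configure() { return init_tables(); }  // remote servers do real setup
  virtual ~BasePsServer() = default;
 protected:
  virtual int32_t init_tables() { return 0; }
};

struct LocalPsServer : BasePsServer {
  int32_t configure() override { return 0; }  // local mode: nothing to configure
};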
class PSServer { virtual int32_t configure( const PSParameter &config, PSEnvironment &env, size_t server_rank, - const std::vector &server_sub_program = {}) final; + const std::vector &server_sub_program = {}); // return server_ip virtual std::string ip() { return butil::my_ip_cstr(); } diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index dde1f5ae8ee3a1d683c805896a470612de6e2aba..c928ebe90ceb9e6a6c2cd7983d112c9a6f9af6b3 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -9,11 +9,24 @@ set_source_files_properties(${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS $ cc_library(graph_node SRCS ${graphDir}/graph_node.cc DEPS WeightedSampler) set_source_files_properties(common_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(common_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ssd_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator) +get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + +set(EXTERN_DEP "") +if(WITH_HETERPS) + set(TABLE_SRC common_sparse_table.cc ssd_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc) + set(EXTERN_DEP rocksdb) +else() + set(TABLE_SRC common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc) +endif() + +cc_library(common_table SRCS ${TABLE_SRC} DEPS ${TABLE_DEPS} +${RPC_DEPS} graph_edge graph_node device_context string_helper +simple_threadpool xxhash generator ${EXTERN_DEP}) set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 0dc99de1bfe82a691fdacb834acd1ad606dcb04b..29bcc04d9c1dfb3f3a5d32040162c4f5c6371672 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -15,12 +15,15 @@ #include "paddle/fluid/distributed/table/common_graph_table.h" #include #include +#include #include #include #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/table/graph/graph_node.h" +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/string_helper.h" + namespace paddle { namespace distributed { @@ -35,6 +38,77 @@ std::vector GraphShard::get_batch(int start, int end, int step) { size_t GraphShard::get_size() { return bucket.size(); } +int32_t GraphTable::add_graph_node(std::vector &id_list, + std::vector &is_weight_list) { + size_t node_size = id_list.size(); + std::vector>> batch(task_pool_size_); + for (size_t i = 0; i < node_size; i++) { + size_t shard_id = id_list[i] % shard_num; + if 
(shard_id >= shard_end || shard_id < shard_start) { + continue; + } + batch[get_thread_pool_index(id_list[i])].push_back( + {id_list[i], i < is_weight_list.size() ? is_weight_list[i] : false}); + } + std::vector> tasks; + for (size_t i = 0; i < batch.size(); ++i) { + if (!batch[i].size()) continue; + tasks.push_back(_shards_task_pool[i]->enqueue([&batch, i, this]() -> int { + for (auto &p : batch[i]) { + size_t index = p.first % this->shard_num - this->shard_start; + this->shards[index].add_graph_node(p.first)->build_edges(p.second); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + return 0; +} + +int32_t GraphTable::remove_graph_node(std::vector &id_list) { + size_t node_size = id_list.size(); + std::vector> batch(task_pool_size_); + for (size_t i = 0; i < node_size; i++) { + size_t shard_id = id_list[i] % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) continue; + batch[get_thread_pool_index(id_list[i])].push_back(id_list[i]); + } + std::vector> tasks; + for (size_t i = 0; i < batch.size(); ++i) { + if (!batch[i].size()) continue; + tasks.push_back(_shards_task_pool[i]->enqueue([&batch, i, this]() -> int { + for (auto &p : batch[i]) { + size_t index = p % this->shard_num - this->shard_start; + this->shards[index].delete_node(p); + } + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + return 0; +} + +void GraphShard::clear() { + for (size_t i = 0; i < bucket.size(); i++) { + delete bucket[i]; + } + bucket.clear(); + node_location.clear(); +} + +GraphShard::~GraphShard() { clear(); } +void GraphShard::delete_node(uint64_t id) { + auto iter = node_location.find(id); + if (iter == node_location.end()) return; + int pos = iter->second; + delete bucket[pos]; + if (pos != (int)bucket.size() - 1) { + bucket[pos] = bucket.back(); + node_location[bucket.back()->get_id()] = pos; + } + node_location.erase(id); + bucket.pop_back(); +} GraphNode *GraphShard::add_graph_node(uint64_t id) { if (node_location.find(id) == node_location.end()) { node_location[id] = bucket.size(); @@ -79,11 +153,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( int start = 0, end, index = 0, total_size = 0; res.clear(); std::vector>> tasks; - // std::string temp = ""; - // for(int i = 0;i < shards.size();i++) - // temp+= std::to_string((int)shards[i].get_size()) + " "; - // VLOG(0)<<"range distribution "<enqueue( [this, first, second, i]() -> std::vector { return shards[i].get_ids_by_range(first, second); @@ -106,7 +175,7 @@ int32_t GraphTable::get_nodes_ids_by_ranges( } total_size += shards[i].get_size(); } - for (int i = 0; i < tasks.size(); i++) { + for (size_t i = 0; i < tasks.size(); i++) { auto vec = tasks[i].get(); for (auto &id : vec) { res.push_back(id); @@ -219,7 +288,7 @@ int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { for (auto &shard : shards) { auto bucket = shard.get_bucket(); - for (int i = 0; i < bucket.size(); i++) { + for (size_t i = 0; i < bucket.size(); i++) { bucket[i]->build_sampler(sample_type); } } @@ -238,10 +307,29 @@ Node *GraphTable::find_node(uint64_t id) { uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { return node_id % shard_num % shard_num_per_table % task_pool_size_; } + +uint32_t GraphTable::get_thread_pool_index_by_shard_index( + uint64_t shard_index) { + return shard_index % shard_num_per_table % task_pool_size_; +} + +int32_t GraphTable::clear_nodes() { + std::vector> tasks; + for (size_t i = 0; i < shards.size(); i++) { + tasks.push_back( + 
_shards_task_pool[get_thread_pool_index_by_shard_index(i)]->enqueue( + [this, i]() -> int { + this->shards[i].clear(); + return 0; + })); + } + for (size_t i = 0; i < tasks.size(); i++) tasks[i].get(); + return 0; +} + int32_t GraphTable::random_sample_nodes(int sample_size, std::unique_ptr &buffer, int &actual_size) { - bool need_feature = false; int total_size = 0; for (int i = 0; i < shards.size(); i++) { total_size += shards[i].get_size(); @@ -281,7 +369,7 @@ int32_t GraphTable::random_sample_nodes(int sample_size, } std::vector> first_half, second_half; int start_index = rand() % total_size; - for (int i = 0; i < ranges_len.size() && i < ranges_pos.size(); i++) { + for (size_t i = 0; i < ranges_len.size() && i < ranges_pos.size(); i++) { if (ranges_pos[i] + ranges_len[i] - 1 + start_index < total_size) first_half.push_back({ranges_pos[i] + start_index, ranges_pos[i] + ranges_len[i] + start_index}); @@ -314,31 +402,34 @@ int32_t GraphTable::random_sample_neighboors( uint64_t &node_id = node_ids[idx]; std::unique_ptr &buffer = buffers[idx]; int &actual_size = actual_sizes[idx]; - tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( - [&]() -> int { - Node *node = find_node(node_id); - if (node == nullptr) { - actual_size = 0; - return 0; - } - std::vector res = node->sample_k(sample_size); - actual_size = res.size() * (Node::id_size + Node::weight_size); - int offset = 0; - uint64_t id; - float weight; - char *buffer_addr = new char[actual_size]; - buffer.reset(buffer_addr); - for (int &x : res) { - id = node->get_neighbor_id(x); - weight = node->get_neighbor_weight(x); - memcpy(buffer_addr + offset, &id, Node::id_size); - offset += Node::id_size; - memcpy(buffer_addr + offset, &weight, Node::weight_size); - offset += Node::weight_size; - } - return 0; - })); + int thread_pool_index = get_thread_pool_index(node_id); + auto rng = _shards_task_rng_pool[thread_pool_index]; + + tasks.push_back(_shards_task_pool[thread_pool_index]->enqueue([&]() -> int { + Node *node = find_node(node_id); + + if (node == nullptr) { + actual_size = 0; + return 0; + } + std::vector res = node->sample_k(sample_size, rng); + actual_size = res.size() * (Node::id_size + Node::weight_size); + int offset = 0; + uint64_t id; + float weight; + char *buffer_addr = new char[actual_size]; + buffer.reset(buffer_addr); + for (int &x : res) { + id = node->get_neighbor_id(x); + weight = node->get_neighbor_weight(x); + memcpy(buffer_addr + offset, &id, Node::id_size); + offset += Node::id_size; + memcpy(buffer_addr + offset, &weight, Node::weight_size); + offset += Node::weight_size; + } + return 0; + })); } for (size_t idx = 0; idx < node_num; ++idx) { tasks[idx].get(); @@ -386,7 +477,6 @@ std::pair GraphTable::parse_feature( if (this->feat_id_map.count(fields[0])) { int32_t id = this->feat_id_map[fields[0]]; std::string dtype = this->feat_dtype[id]; - int32_t shape = this->feat_shape[id]; std::vector values(fields.begin() + 1, fields.end()); if (dtype == "feasign") { return std::make_pair( @@ -428,7 +518,6 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, int end = start + (count - 1) * step + 1; tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( [this, i, start, end, step, size]() -> std::vector { - return this->shards[i].get_batch(start - size, end - size, step); })); start += count * step; @@ -462,6 +551,7 @@ int32_t GraphTable::initialize() { _shards_task_pool.resize(task_pool_size_); for (size_t i = 0; i < _shards_task_pool.size(); ++i) { 
_shards_task_pool[i].reset(new ::ThreadPool(1)); + _shards_task_rng_pool.push_back(paddle::framework::GetCPURandomEngine(0)); } server_num = _shard_num; // VLOG(0) << "in init graph table server num = " << server_num; @@ -502,5 +592,5 @@ int32_t GraphTable::initialize() { shards = std::vector(shard_num_per_table, GraphShard(shard_num)); return 0; } -} -}; +} // namespace distributed +}; // namespace paddle diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index b18da82abe61c9695712f542e187ac48fd5edc9d..6ccce44c7ead6983efb57718999f1b36499b34e8 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -36,11 +36,12 @@ class GraphShard { size_t get_size(); GraphShard() {} GraphShard(int shard_num) { this->shard_num = shard_num; } + ~GraphShard(); std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); std::vector get_ids_by_range(int start, int end) { std::vector res; - for (int i = start; i < end && i < bucket.size(); i++) { + for (int i = start; i < end && i < (int)bucket.size(); i++) { res.push_back(bucket[i]->get_id()); } return res; @@ -48,6 +49,8 @@ class GraphShard { GraphNode *add_graph_node(uint64_t id); FeatureNode *add_feature_node(uint64_t id); Node *find_node(uint64_t id); + void delete_node(uint64_t id); + void clear(); void add_neighboor(uint64_t id, uint64_t dst_id, float weight); std::unordered_map get_node_location() { return node_location; @@ -85,6 +88,11 @@ class GraphTable : public SparseTable { int32_t load_nodes(const std::string &path, std::string node_type); + int32_t add_graph_node(std::vector &id_list, + std::vector &is_weight_list); + + int32_t remove_graph_node(std::vector &id_list); + Node *find_node(uint64_t id); virtual int32_t pull_sparse(float *values, @@ -97,6 +105,7 @@ class GraphTable : public SparseTable { return 0; } + virtual int32_t clear_nodes(); virtual void clear() {} virtual int32_t flush() { return 0; } virtual int32_t shrink(const std::string ¶m) { return 0; } @@ -105,6 +114,7 @@ class GraphTable : public SparseTable { return 0; } virtual int32_t initialize_shard() { return 0; } + virtual uint32_t get_thread_pool_index_by_shard_index(uint64_t shard_index); virtual uint32_t get_thread_pool_index(uint64_t node_id); virtual std::pair parse_feature(std::string feat_str); @@ -126,6 +136,8 @@ class GraphTable : public SparseTable { std::string table_type; std::vector> _shards_task_pool; + std::vector> _shards_task_rng_pool; }; } // namespace distributed + }; // namespace paddle diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index 1c315d34abcb6ef73d898da4f71e0659842e5588..e1223face0f54ac782fa41ff16a2db1b08aa413a 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -13,9 +13,9 @@ // limitations under the License. 
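One detail of the graph-table changes above that is easy to miss: GraphTable::initialize builds task_pool_size_ single-thread pools and now also one random engine per pool (_shards_task_rng_pool), and random_sample_neighboors selects both by the same index, so concurrent sampling tasks never share an engine. A simplified standalone sketch of that pairing follows, using std::mt19937_64 in place of Paddle's GetCPURandomEngine and a flat modulo instead of the full get_thread_pool_index formula.

#include <cstdint>
#include <memory>
#include <random>
#include <vector>

// One engine per worker slot; a task picks the engine with the same index
// as the thread pool it will run on, so no engine is shared across threads.
struct PerSlotRng {
  explicit PerSlotRng(size_t slots) {
    std::random_device seed;
    for (size_t i = 0; i < slots; ++i)
      engines_.push_back(std::make_shared<std::mt19937_64>(seed()));
  }
  std::shared_ptr<std::mt19937_64> pick(uint64_t node_id) const {
    return engines_[node_id % engines_.size()];
  }
  std::vector<std::shared_ptr<std::mt19937_64>> engines_;
};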
#include "paddle/fluid/distributed/table/common_sparse_table.h" - #include +#include "boost/lexical_cast.hpp" #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" @@ -25,82 +25,12 @@ class ValueBlock; } // namespace distributed } // namespace paddle -#define PSERVER_SAVE_SUFFIX "_txt" - namespace paddle { namespace distributed { -enum SaveMode { all, base, delta }; - -struct Meta { - std::string param; - int shard_id; - std::vector names; - std::vector dims; - uint64_t count; - std::unordered_map dims_map; - - explicit Meta(const std::string& metapath) { - std::ifstream file(metapath); - std::string line; - int num_lines = 0; - while (std::getline(file, line)) { - if (StartWith(line, "#")) { - continue; - } - auto pairs = paddle::string::split_string(line, "="); - PADDLE_ENFORCE_EQ( - pairs.size(), 2, - paddle::platform::errors::InvalidArgument( - "info in %s except k=v, but got %s", metapath, line)); - - if (pairs[0] == "param") { - param = pairs[1]; - } - if (pairs[0] == "shard_id") { - shard_id = std::stoi(pairs[1]); - } - if (pairs[0] == "row_names") { - names = paddle::string::split_string(pairs[1], ","); - } - if (pairs[0] == "row_dims") { - auto dims_strs = - paddle::string::split_string(pairs[1], ","); - for (auto& str : dims_strs) { - dims.push_back(std::stoi(str)); - } - } - if (pairs[0] == "count") { - count = std::stoull(pairs[1]); - } - } - for (int x = 0; x < names.size(); ++x) { - dims_map[names[x]] = dims[x]; - } - } - - Meta(std::string param, int shard_id, std::vector row_names, - std::vector dims, uint64_t count) { - this->param = param; - this->shard_id = shard_id; - this->names = row_names; - this->dims = dims; - this->count = count; - } - - std::string ToString() { - std::stringstream ss; - ss << "param=" << param << "\n"; - ss << "shard_id=" << shard_id << "\n"; - ss << "row_names=" << paddle::string::join_strings(names, ',') << "\n"; - ss << "row_dims=" << paddle::string::join_strings(dims, ',') << "\n"; - ss << "count=" << count << "\n"; - return ss.str(); - } -}; - -void ProcessALine(const std::vector& columns, const Meta& meta, - std::vector>* values) { +void CommonSparseTable::ProcessALine(const std::vector& columns, + const Meta& meta, const int64_t id, + std::vector>* values) { auto colunmn_size = columns.size(); auto load_values = paddle::string::split_string(columns[colunmn_size - 1], ","); @@ -116,49 +46,83 @@ void ProcessALine(const std::vector& columns, const Meta& meta, "The data format in txt does not meet the field " "requirements defined in meta")); - std::transform(start, end, std::back_inserter(val), - [](std::string va) { return std::stof(va); }); + std::transform(start, end, std::back_inserter(val), [id](std::string va) { + float v = 0.0; + + try { + v = lexical_cast(va); + } catch (boost::bad_lexical_cast& e) { + VLOG(0) << "id: " << id << " get unexpected value: " << va + << " and be reset to: 0.0"; + } + return v; + }); + values->push_back(val); offset += meta.dims[x]; } } -int64_t SaveToText(std::ostream* os, std::shared_ptr block, - const int mode) { - int64_t not_save_num = 0; - for (auto& value : block->values_) { - if (mode == SaveMode::delta && !value.second.need_save_) { - not_save_num++; - continue; - } +void CommonSparseTable::SaveMetaToText(std::ostream* os, + const CommonAccessorParameter& common, + const size_t shard_idx, + const int64_t total) { + // save meta + std::stringstream stream; + stream << "param=" << common.table_name() << "\n"; + stream << "shard_id=" << shard_idx << "\n"; + stream << "row_names=" << 
paddle::string::join_strings(common.params(), ',') + << "\n"; + stream << "row_dims=" << paddle::string::join_strings(common.dims(), ',') + << "\n"; + stream << "count=" << total << "\n"; + os->write(stream.str().c_str(), sizeof(char) * stream.str().size()); +} - auto* vs = value.second.data_; - std::stringstream ss; - auto id = value.first; - ss << id << "\t" << value.second.count_ << "\t" << value.second.unseen_days_ - << "\t" << value.second.is_entry_ << "\t"; +int64_t CommonSparseTable::SaveValueToText(std::ostream* os, + std::shared_ptr block, + std::shared_ptr<::ThreadPool> pool, + const int mode, int shard_id) { + int64_t save_num = 0; + for (auto& table : block->values_) { + for (auto& value : table) { + if (mode == SaveMode::delta && !value.second->need_save_) { + continue; + } - for (int i = 0; i < block->value_length_; i++) { - ss << vs[i]; - ss << ","; - } + ++save_num; + + std::stringstream ss; + auto* vs = value.second->data_.data(); - ss << "\n"; + auto id = value.first; - os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); + ss << id << "\t" << value.second->count_ << "\t" + << value.second->unseen_days_ << "\t" << value.second->is_entry_ + << "\t"; + + for (int i = 0; i < block->value_length_ - 1; i++) { + ss << std::to_string(vs[i]) << ","; + } - if (mode == SaveMode::base || mode == SaveMode::delta) { - value.second.need_save_ = false; + ss << std::to_string(vs[block->value_length_ - 1]); + ss << "\n"; + + os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); + + if (mode == SaveMode::base || mode == SaveMode::delta) { + value.second->need_save_ = false; + } } } - return block->values_.size() - not_save_num; + return save_num; } -int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, - const int pserver_id, const int pserver_num, - const int local_shard_num, - std::vector>* blocks) { +int64_t CommonSparseTable::LoadFromText( + const std::string& valuepath, const std::string& metapath, + const int pserver_id, const int pserver_num, const int local_shard_num, + std::vector>* blocks) { Meta meta = Meta(metapath); int num_lines = 0; @@ -167,7 +131,7 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, while (std::getline(file, line)) { auto values = paddle::string::split_string(line, "\t"); - auto id = std::stoull(values[0]); + auto id = lexical_cast(values[0]); if (id % pserver_num != pserver_id) { VLOG(3) << "will not load " << values[0] << " from " << valuepath @@ -179,15 +143,17 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, auto block = blocks->at(shard_id); std::vector> kvalues; - ProcessALine(values, meta, &kvalues); + ProcessALine(values, meta, id, &kvalues); block->Init(id, false); - auto value_instant = block->GetValue(id); + VALUE* value_instant = block->GetValue(id); + if (values.size() == 5) { - value_instant->count_ = std::stoi(values[1]); - value_instant->unseen_days_ = std::stoi(values[2]); - value_instant->is_entry_ = static_cast(std::stoi(values[3])); + value_instant->count_ = lexical_cast(values[1]); + value_instant->unseen_days_ = lexical_cast(values[2]); + value_instant->is_entry_ = + static_cast(lexical_cast(values[3])); } std::vector block_values = block->Get(id, meta.names, meta.dims); @@ -314,16 +280,24 @@ int32_t CommonSparseTable::set_global_lr(float* lr) { int32_t CommonSparseTable::load(const std::string& path, const std::string& param) { + auto begin = GetCurrentUS(); rwlock_->WRLock(); - VLOG(3) << "sparse table load with " << path << " 
with meta " << param; LoadFromText(path, param, _shard_idx, _shard_num, task_pool_size_, &shard_values_); rwlock_->UNLock(); + auto end = GetCurrentUS(); + + auto varname = _config.common().table_name(); + VLOG(0) << "load " << varname << " with value: " << path + << " , meta: " << param + << " using: " << std::to_string((end - begin) / 1e+6) << " seconds"; + return 0; } int32_t CommonSparseTable::save(const std::string& dirname, const std::string& param) { + auto begin = GetCurrentUS(); rwlock_->WRLock(); int mode = std::stoi(param); VLOG(3) << "sparse table save: " << dirname << " mode: " << mode; @@ -336,36 +310,34 @@ int32_t CommonSparseTable::save(const std::string& dirname, VLOG(3) << "save " << varname << " in dir: " << var_store << " begin"; std::vector params(_config.common().params().begin(), _config.common().params().end()); + std::string shard_var_pre = string::Sprintf("%s.block%d", varname, _shard_idx); std::string value_ = string::Sprintf("%s/%s.txt", var_store, shard_var_pre); - std::unique_ptr value_out(new std::ofstream(value_)); + std::unique_ptr vs(new std::ofstream(value_)); int64_t total_ins = 0; for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { // save values - total_ins += SaveToText(value_out.get(), shard_values_[shard_id], mode); + auto shard_save_num = + SaveValueToText(vs.get(), shard_values_[shard_id], + _shards_task_pool[shard_id], mode, shard_id); + total_ins += shard_save_num; } - value_out->close(); + vs->close(); - // save meta - std::stringstream stream; - stream << "param=" << _config.common().table_name() << "\n"; - stream << "shard_id=" << _shard_idx << "\n"; - stream << "row_names=" - << paddle::string::join_strings(_config.common().params(), ',') - << "\n"; - stream << "row_dims=" - << paddle::string::join_strings(_config.common().dims(), ',') << "\n"; - stream << "count=" << total_ins << "\n"; std::string meta_ = string::Sprintf("%s/%s.meta", var_store, shard_var_pre); - std::unique_ptr meta_out(new std::ofstream(meta_)); - meta_out->write(stream.str().c_str(), sizeof(char) * stream.str().size()); - meta_out->close(); - VLOG(3) << "save " << varname << " in dir: " << var_store << " done"; + std::unique_ptr ms(new std::ofstream(meta_)); + SaveMetaToText(ms.get(), _config.common(), _shard_idx, total_ins); + ms->close(); + + auto end = GetCurrentUS(); rwlock_->UNLock(); + VLOG(0) << "save " << varname << " with path: " << value_ + << " using: " << std::to_string((end - begin) / 1e+6) << " seconds"; + return 0; } @@ -373,16 +345,16 @@ std::pair CommonSparseTable::print_table_stat() { int64_t feasign_size = 0; int64_t mf_size = 0; - for (auto& value : shard_values_) { - feasign_size += value->values_.size(); + for (auto& shard : shard_values_) { + for (auto& table : shard->values_) { + feasign_size += table.size(); + } } return {feasign_size, mf_size}; } int32_t CommonSparseTable::pour() { - rwlock_->RDLock(); - std::vector values; std::vector keys; @@ -399,14 +371,11 @@ int32_t CommonSparseTable::pour() { _push_sparse(keys.data(), values.data(), pull_reservoir_.size()); pull_reservoir_.clear(); - rwlock_->UNLock(); return 0; } int32_t CommonSparseTable::pull_sparse(float* pull_values, const PullSparseValue& pull_value) { - rwlock_->RDLock(); - auto shard_num = task_pool_size_; std::vector> tasks(shard_num); @@ -442,7 +411,6 @@ int32_t CommonSparseTable::pull_sparse(float* pull_values, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } @@ -470,7 +438,7 @@ int32_t 
CommonSparseTable::pull_sparse_ptr(char** pull_values, auto* value = block->InitGet(id); // std::copy_n(value + param_offset_, param_dim_, // pull_values + param_dim_ * offset); - pull_values[offset] = (char*)value; + pull_values[offset] = reinterpret_cast(value); } return 0; @@ -485,7 +453,6 @@ int32_t CommonSparseTable::pull_sparse_ptr(char** pull_values, int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, const float* values, size_t num) { - rwlock_->RDLock(); std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -509,7 +476,6 @@ int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } @@ -547,7 +513,6 @@ int32_t CommonSparseTable::push_sparse(const uint64_t* keys, int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, const float** values, size_t num) { - rwlock_->RDLock(); std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -574,14 +539,11 @@ int32_t CommonSparseTable::_push_sparse(const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, const float* values, size_t num) { - rwlock_->RDLock(); - std::vector> offset_bucket; offset_bucket.resize(task_pool_size_); @@ -613,14 +575,12 @@ int32_t CommonSparseTable::push_sparse_param(const uint64_t* keys, for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { tasks[shard_id].wait(); } - rwlock_->UNLock(); return 0; } int32_t CommonSparseTable::flush() { return 0; } int32_t CommonSparseTable::shrink(const std::string& param) { - rwlock_->WRLock(); int threshold = std::stoi(param); VLOG(3) << "sparse table shrink: " << threshold; @@ -629,7 +589,6 @@ int32_t CommonSparseTable::shrink(const std::string& param) { VLOG(4) << shard_id << " " << task_pool_size_ << " begin shrink"; shard_values_[shard_id]->Shrink(threshold); } - rwlock_->UNLock(); return 0; } diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/table/common_sparse_table.h index 50c295da53464c8cc1589b27a6dbc233367991b4..ce3cc11686a4807e9de616e2de2dc1d9b1e7c3f9 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/table/common_sparse_table.h @@ -32,11 +32,83 @@ #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/string/string_helper.h" +#define PSERVER_SAVE_SUFFIX ".shard" +using boost::lexical_cast; + namespace paddle { namespace distributed { class SparseOptimizer; +enum SaveMode { all, base, delta }; + +struct Meta { + std::string param; + int shard_id; + std::vector names; + std::vector dims; + uint64_t count; + std::unordered_map dims_map; + + explicit Meta(const std::string& metapath) { + std::ifstream file(metapath); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + if (StartWith(line, "#")) { + continue; + } + auto pairs = paddle::string::split_string(line, "="); + PADDLE_ENFORCE_EQ( + pairs.size(), 2, + paddle::platform::errors::InvalidArgument( + "info in %s except k=v, but got %s", metapath, line)); + + if (pairs[0] == "param") { + param = pairs[1]; + } + if (pairs[0] == "shard_id") { + shard_id = std::stoi(pairs[1]); + } + if (pairs[0] == "row_names") { + names = paddle::string::split_string(pairs[1], ","); + } + if (pairs[0] == "row_dims") { + auto dims_strs = + paddle::string::split_string(pairs[1], ","); + 
for (auto& str : dims_strs) { + dims.push_back(std::stoi(str)); + } + } + if (pairs[0] == "count") { + count = std::stoull(pairs[1]); + } + } + for (int x = 0; x < names.size(); ++x) { + dims_map[names[x]] = dims[x]; + } + } + + Meta(std::string param, int shard_id, std::vector row_names, + std::vector dims, uint64_t count) { + this->param = param; + this->shard_id = shard_id; + this->names = row_names; + this->dims = dims; + this->count = count; + } + + std::string ToString() { + std::stringstream ss; + ss << "param=" << param << "\n"; + ss << "shard_id=" << shard_id << "\n"; + ss << "row_names=" << paddle::string::join_strings(names, ',') << "\n"; + ss << "row_dims=" << paddle::string::join_strings(dims, ',') << "\n"; + ss << "count=" << count << "\n"; + return ss.str(); + } +}; + class CommonSparseTable : public SparseTable { public: CommonSparseTable() { rwlock_.reset(new framework::RWLock); } @@ -56,9 +128,25 @@ class CommonSparseTable : public SparseTable { virtual int32_t initialize_optimizer(); virtual int32_t initialize_recorder(); - int32_t load(const std::string& path, const std::string& param); + virtual int32_t load(const std::string& path, const std::string& param); + + virtual int32_t save(const std::string& path, const std::string& param); + + void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common, + const size_t shard_idx, const int64_t total); - int32_t save(const std::string& path, const std::string& param); + int64_t SaveValueToText(std::ostream* os, std::shared_ptr block, + std::shared_ptr<::ThreadPool> pool, const int mode, + int shard_id); + + virtual void ProcessALine(const std::vector& columns, + const Meta& meta, const int64_t id, + std::vector>* values); + + virtual int64_t LoadFromText( + const std::string& valuepath, const std::string& metapath, + const int pserver_id, const int pserver_num, const int local_shard_num, + std::vector>* blocks); virtual std::pair print_table_stat(); virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); @@ -89,7 +177,7 @@ class CommonSparseTable : public SparseTable { virtual int32_t _push_sparse(const uint64_t* keys, const float** values, size_t num); - private: + protected: const int task_pool_size_ = 11; std::vector> _shards_task_pool; diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index bb4174bd2c579699e0afbf896a17bcdd42d1ee36..ac11183d192fffcec80dc1d4a586cda95751c6cd 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -26,6 +26,7 @@ #include #include "gflags/gflags.h" +#include "butil/object_pool.h" #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/table/depends/initializers.h" #include "paddle/fluid/distributed/thirdparty/round_robin.h" @@ -48,6 +49,10 @@ namespace distributed { enum Mode { training, infer }; +static const int SPARSE_SHARD_BUCKET_NUM_BITS = 6; +static const size_t SPARSE_SHARD_BUCKET_NUM = (size_t)1 + << SPARSE_SHARD_BUCKET_NUM_BITS; + struct VALUE { explicit VALUE(size_t length) : length_(length), @@ -55,46 +60,16 @@ struct VALUE { unseen_days_(0), need_save_(false), is_entry_(false) { - data_ = new float[length]; - memset(data_, 0, sizeof(float) * length); - } - - VALUE(const VALUE &value) { - length_ = value.length_; - count_ = value.count_; - unseen_days_ = value.unseen_days_; - need_save_ = value.need_save_; - is_entry_ = value.is_entry_; - data_ = new 
float[length_]; - memcpy(data_, value.data_, sizeof(float) * length_); - } - - VALUE &operator=(const VALUE &value) { - if (this != &value) { - delete[] data_; - length_ = value.length_; - count_ = value.count_; - unseen_days_ = value.unseen_days_; - need_save_ = value.need_save_; - is_entry_ = value.is_entry_; - - data_ = new float[length_]; - memcpy(data_, value.data_, sizeof(float) * length_); - } - return *this; - } - - ~VALUE() { - delete[] data_; - data_ = nullptr; + data_.resize(length); + memset(data_.data(), 0, sizeof(float) * length); } size_t length_; + std::vector data_; int count_; int unseen_days_; // use to check knock-out bool need_save_; // whether need to save bool is_entry_; // whether knock-in - float *data_; }; inline bool count_entry(VALUE *value, int threshold) { @@ -108,6 +83,7 @@ inline bool probility_entry(VALUE *value, float threshold) { class ValueBlock { public: + typedef typename robin_hood::unordered_map map_type; explicit ValueBlock(const std::vector &value_names, const std::vector &value_dims, const std::vector &value_offsets, @@ -176,12 +152,12 @@ class ValueBlock { const std::vector &value_dims) { auto pts = std::vector(); pts.reserve(value_names.size()); - auto &values = values_.at(id); + auto values = GetValue(id); for (int i = 0; i < static_cast(value_names.size()); i++) { PADDLE_ENFORCE_EQ( value_dims[i], value_dims_[i], platform::errors::InvalidArgument("value dims is not match")); - pts.push_back(values.data_ + + pts.push_back(values->data_.data() + value_offsets_.at(value_idx_.at(value_names[i]))); } return pts; @@ -190,33 +166,45 @@ class ValueBlock { // pull float *Init(const uint64_t &id, const bool with_update = true, const int counter = 1) { - if (!Has(id)) { - values_.emplace(std::make_pair(id, VALUE(value_length_))); - } + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); - auto &value = values_.at(id); + auto &table = values_[bucket]; + auto res = table.find(id); - if (with_update) { - AttrUpdate(&value, counter); + VALUE *value = nullptr; + if (res == table.end()) { + value = butil::get_object(value_length_); + + table[id] = value; + + } else { + value = res->second; } - return value.data_; + if (with_update) { + AttrUpdate(value, counter); + } + return value->data_.data(); } - VALUE *InitGet(const uint64_t &id, const bool with_update = true, const int counter = 1) { - if (!Has(id)) { - values_.emplace(std::make_pair(id, VALUE(value_length_))); - } + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); - auto &value = values_.at(id); + auto &table = values_[bucket]; + auto res = table.find(id); - if (with_update) { - AttrUpdate(&value, counter); + VALUE *value = nullptr; + if (res == table.end()) { + value = butil::get_object(value_length_); + // value = _alloc.acquire(value_length_); + table[id] = value; + } else { + value = (VALUE *)(void *)(res->second); } - - return &value; + return value; } void AttrUpdate(VALUE *value, const int counter) { @@ -229,7 +217,7 @@ class ValueBlock { if (value->is_entry_) { // initialize for (size_t x = 0; x < value_names_.size(); ++x) { - initializers_[x]->GetValue(value->data_ + value_offsets_[x], + initializers_[x]->GetValue(value->data_.data() + value_offsets_[x], value_dims_[x]); } value->need_save_ = true; @@ -243,42 +231,102 @@ class ValueBlock { // dont jude if (has(id)) float *Get(const uint64_t &id) { - auto &value = values_.at(id); - return value.data_; + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + // auto 
&value = table.at(id); + // return value->data_.data(); + auto res = table.find(id); + VALUE *value = res->second; + return value->data_.data(); } // for load, to reset count, unseen_days - VALUE *GetValue(const uint64_t &id) { return &values_.at(id); } + VALUE *GetValue(const uint64_t &id) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + + auto &table = values_[bucket]; + auto res = table.find(id); + return res->second; + } bool GetEntry(const uint64_t &id) { - auto &value = values_.at(id); - return value.is_entry_; + auto value = GetValue(id); + return value->is_entry_; } void SetEntry(const uint64_t &id, const bool state) { - auto &value = values_.at(id); - value.is_entry_ = state; + auto value = GetValue(id); + value->is_entry_ = state; + } + + void erase(uint64_t feasign) { + size_t hash = _hasher(feasign); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto iter = table.find(feasign); + if (iter != table.end()) { + butil::return_object(iter->second); + iter = table.erase(iter); + } } void Shrink(const int threshold) { - for (auto iter = values_.begin(); iter != values_.end();) { - auto &value = iter->second; - value.unseen_days_++; - if (value.unseen_days_ >= threshold) { - iter = values_.erase(iter); - } else { - ++iter; + for (auto &table : values_) { + for (auto iter = table.begin(); iter != table.end();) { + // VALUE* value = (VALUE*)(void*)(iter->second); + VALUE *value = iter->second; + value->unseen_days_++; + if (value->unseen_days_ >= threshold) { + butil::return_object(iter->second); + //_alloc.release(iter->second); + //_alloc.release(value); + iter = table.erase(iter); + } else { + ++iter; + } } } return; } float GetThreshold() { return threshold_; } + size_t compute_bucket(size_t hash) { + if (SPARSE_SHARD_BUCKET_NUM == 1) { + return 0; + } else { + return hash >> (sizeof(size_t) * 8 - SPARSE_SHARD_BUCKET_NUM_BITS); + } + } + + map_type::iterator end() { + return values_[SPARSE_SHARD_BUCKET_NUM - 1].end(); + } + + map_type::iterator Find(uint64_t id) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto got = table.find(id); + if (got == table.end()) { + return end(); + } else { + return got; + } + } private: bool Has(const uint64_t id) { - auto got = values_.find(id); - if (got == values_.end()) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto got = table.find(id); + if (got == table.end()) { return false; } else { return true; @@ -286,8 +334,9 @@ class ValueBlock { } public: - robin_hood::unordered_map values_; + map_type values_[SPARSE_SHARD_BUCKET_NUM]; size_t value_length_ = 0; + std::hash _hasher; private: const std::vector &value_names_; @@ -302,4 +351,3 @@ class ValueBlock { } // namespace distributed } // namespace paddle - diff --git a/paddle/fluid/distributed/table/depends/rocksdb_warpper.h b/paddle/fluid/distributed/table/depends/rocksdb_warpper.h new file mode 100644 index 0000000000000000000000000000000000000000..0e25a89cb14d7293045cde871ad2ae0ce1cb5d66 --- /dev/null +++ b/paddle/fluid/distributed/table/depends/rocksdb_warpper.h @@ -0,0 +1,158 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
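[Reviewer sketch, not part of the patch] The ValueBlock rewrite above replaces the single robin_hood map with an array of SPARSE_SHARD_BUCKET_NUM (2^6 = 64) bucket maps, routing each key through std::hash and compute_bucket, which keeps the top SPARSE_SHARD_BUCKET_NUM_BITS bits of the hash. Below is a minimal standalone illustration of that bucket-selection idea only; it uses std::unordered_map instead of robin_hood, and every name in it is hypothetical rather than Paddle's.

#include <cstdint>
#include <functional>
#include <unordered_map>

constexpr int kBucketBits = 6;
constexpr std::size_t kBucketNum = std::size_t(1) << kBucketBits;  // 64 buckets

// Pick a bucket from the highest kBucketBits bits of the hash value.
inline std::size_t ComputeBucket(std::size_t hash) {
  return hash >> (sizeof(std::size_t) * 8 - kBucketBits);
}

int main() {
  std::unordered_map<std::uint64_t, float> buckets[kBucketNum];
  std::hash<std::uint64_t> hasher;

  std::uint64_t feasign = 0x9e3779b97f4a7c15ULL;   // arbitrary example key
  std::size_t bucket = ComputeBucket(hasher(feasign));
  buckets[bucket][feasign] = 1.0f;                 // insert into its shard
  return buckets[bucket].count(feasign) == 1 ? 0 : 1;
}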
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_HETERPS +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace distributed { + +class RocksDBHandler { + public: + RocksDBHandler() {} + ~RocksDBHandler() {} + + static RocksDBHandler* GetInstance() { + static RocksDBHandler handler; + return &handler; + } + + int initialize(const std::string& db_path, const int colnum) { + VLOG(3) << "db path: " << db_path << " colnum: " << colnum; + rocksdb::Options options; + rocksdb::BlockBasedTableOptions bbto; + bbto.block_size = 4 * 1024; + bbto.block_cache = rocksdb::NewLRUCache(64 * 1024 * 1024); + bbto.block_cache_compressed = rocksdb::NewLRUCache(64 * 1024 * 1024); + bbto.cache_index_and_filter_blocks = false; + bbto.filter_policy.reset(rocksdb::NewBloomFilterPolicy(20, false)); + bbto.whole_key_filtering = true; + options.table_factory.reset(rocksdb::NewBlockBasedTableFactory(bbto)); + + options.keep_log_file_num = 100; + options.max_log_file_size = 50 * 1024 * 1024; // 50MB + options.create_if_missing = true; + options.use_direct_reads = true; + options.max_background_flushes = 5; + options.max_background_compactions = 5; + options.base_background_compactions = 10; + options.write_buffer_size = 256 * 1024 * 1024; // 256MB + options.max_write_buffer_number = 8; + options.max_bytes_for_level_base = + options.max_write_buffer_number * options.write_buffer_size; + options.min_write_buffer_number_to_merge = 1; + options.target_file_size_base = 1024 * 1024 * 1024; // 1024MB + options.memtable_prefix_bloom_size_ratio = 0.02; + options.num_levels = 4; + options.max_open_files = -1; + + options.compression = rocksdb::kNoCompression; + options.level0_file_num_compaction_trigger = 8; + options.level0_slowdown_writes_trigger = + 1.8 * options.level0_file_num_compaction_trigger; + options.level0_stop_writes_trigger = + 3.6 * options.level0_file_num_compaction_trigger; + + if (!db_path.empty()) { + std::string rm_cmd = "rm -rf " + db_path; + system(rm_cmd.c_str()); + } + + rocksdb::Status s = rocksdb::DB::Open(options, db_path, &_db); + assert(s.ok()); + _handles.resize(colnum); + for (int i = 0; i < colnum; i++) { + s = _db->CreateColumnFamily(options, "shard_" + std::to_string(i), + &_handles[i]); + assert(s.ok()); + } + LOG(INFO) << "DB initialize success, colnum:" << colnum; + return 0; + } + + int put(int id, const char* key, int key_len, const char* value, + int value_len) { + rocksdb::WriteOptions options; + options.disableWAL = true; + rocksdb::Status s = + _db->Put(options, _handles[id], rocksdb::Slice(key, key_len), + rocksdb::Slice(value, value_len)); + assert(s.ok()); + return 0; + } + + int put_batch(int id, std::vector>& ssd_keys, + std::vector>& ssd_values, int n) { + rocksdb::WriteOptions options; + options.disableWAL = true; + rocksdb::WriteBatch batch(n * 128); + for (int i = 0; i < n; i++) { + batch.Put(_handles[id], + rocksdb::Slice(ssd_keys[i].first, ssd_keys[i].second), + rocksdb::Slice(ssd_values[i].first, ssd_values[i].second)); + } + rocksdb::Status s = _db->Write(options, &batch); + assert(s.ok()); + return 0; + } + + int 
get(int id, const char* key, int key_len, std::string& value) { + rocksdb::Status s = _db->Get(rocksdb::ReadOptions(), _handles[id], + rocksdb::Slice(key, key_len), &value); + if (s.IsNotFound()) { + return 1; + } + assert(s.ok()); + return 0; + } + + int del_data(int id, const char* key, int key_len) { + rocksdb::WriteOptions options; + options.disableWAL = true; + rocksdb::Status s = + _db->Delete(options, _handles[id], rocksdb::Slice(key, key_len)); + assert(s.ok()); + return 0; + } + + int flush(int id) { + rocksdb::Status s = _db->Flush(rocksdb::FlushOptions(), _handles[id]); + assert(s.ok()); + return 0; + } + + rocksdb::Iterator* get_iterator(int id) { + return _db->NewIterator(rocksdb::ReadOptions(), _handles[id]); + } + + int get_estimate_key_num(uint64_t& num_keys) { + _db->GetAggregatedIntProperty("rocksdb.estimate-num-keys", &num_keys); + return 0; + } + + private: + std::vector _handles; + rocksdb::DB* _db; +}; +} +} +#endif diff --git a/paddle/fluid/distributed/table/graph/graph_node.cc b/paddle/fluid/distributed/table/graph/graph_node.cc index 816d31b979072c3f1679df1ea75cd9dc75c55b0a..e2311cc307b6057937408c94c0093f3af1f0882e 100644 --- a/paddle/fluid/distributed/table/graph/graph_node.cc +++ b/paddle/fluid/distributed/table/graph/graph_node.cc @@ -113,5 +113,5 @@ void FeatureNode::recover_from_buffer(char* buffer) { feature.push_back(std::string(str)); } } -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/graph/graph_node.h b/paddle/fluid/distributed/table/graph/graph_node.h index 8ad795ac97b5499c7b10361760f7ac16494c154b..62c101ec02a935b4f29948c1e8c53823592e8fdf 100644 --- a/paddle/fluid/distributed/table/graph/graph_node.h +++ b/paddle/fluid/distributed/table/graph/graph_node.h @@ -15,6 +15,7 @@ #pragma once #include #include +#include #include #include #include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" @@ -33,7 +34,10 @@ class Node { virtual void build_edges(bool is_weighted) {} virtual void build_sampler(std::string sample_type) {} virtual void add_edge(uint64_t id, float weight) {} - virtual std::vector sample_k(int k) { return std::vector(); } + virtual std::vector sample_k( + int k, const std::shared_ptr rng) { + return std::vector(); + } virtual uint64_t get_neighbor_id(int idx) { return 0; } virtual float get_neighbor_weight(int idx) { return 1.; } @@ -59,7 +63,10 @@ class GraphNode : public Node { virtual void add_edge(uint64_t id, float weight) { edges->add_edge(id, weight); } - virtual std::vector sample_k(int k) { return sampler->sample_k(k); } + virtual std::vector sample_k( + int k, const std::shared_ptr rng) { + return sampler->sample_k(k, rng); + } virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } @@ -123,5 +130,5 @@ class FeatureNode : public Node { protected: std::vector feature; }; -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc index 3a680875e3df4a9cd60f8fe1921b877dbb23c8a2..7a46433e3defbd51b68ed9f25e9e92f64b6d1afa 100644 --- a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc +++ b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc @@ -14,24 +14,30 @@ #include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" #include +#include #include +#include "paddle/fluid/framework/generator.h" namespace 
paddle { namespace distributed { void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } -std::vector RandomSampler::sample_k(int k) { +std::vector RandomSampler::sample_k( + int k, const std::shared_ptr rng) { int n = edges->size(); - if (k > n) { + if (k >= n) { k = n; + std::vector sample_result; + for (int i = 0; i < k; i++) { + sample_result.push_back(i); + } + return sample_result; } - struct timespec tn; - clock_gettime(CLOCK_REALTIME, &tn); - srand(tn.tv_nsec); std::vector sample_result; std::unordered_map replace_map; while (k--) { - int rand_int = rand() % n; + std::uniform_int_distribution distrib(0, n - 1); + int rand_int = distrib(*rng); auto iter = replace_map.find(rand_int); if (iter == replace_map.end()) { sample_result.push_back(rand_int); @@ -98,19 +104,23 @@ void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, count = left->count + right->count; } } -std::vector WeightedSampler::sample_k(int k) { - if (k > count) { +std::vector WeightedSampler::sample_k( + int k, const std::shared_ptr rng) { + if (k >= count) { k = count; + std::vector sample_result; + for (int i = 0; i < k; i++) { + sample_result.push_back(i); + } + return sample_result; } std::vector sample_result; float subtract; std::unordered_map subtract_weight_map; std::unordered_map subtract_count_map; - struct timespec tn; - clock_gettime(CLOCK_REALTIME, &tn); - srand(tn.tv_nsec); + std::uniform_real_distribution distrib(0, 1.0); while (k--) { - float query_weight = rand() % 100000 / 100000.0; + float query_weight = distrib(*rng); query_weight *= weight - subtract_weight_map[this]; sample_result.push_back(sample(query_weight, subtract_weight_map, subtract_count_map, subtract)); @@ -146,5 +156,5 @@ int WeightedSampler::sample( subtract_count_map[this]++; return return_idx; } -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h index 1787ab23b04316de9ad0622ff5524bc88bd51fe1..4a75a112697d322a2eb49a57d379889d34b6009f 100644 --- a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h +++ b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h @@ -14,6 +14,8 @@ #pragma once #include +#include +#include #include #include #include "paddle/fluid/distributed/table/graph/graph_edge.h" @@ -24,14 +26,16 @@ class Sampler { public: virtual ~Sampler() {} virtual void build(GraphEdgeBlob *edges) = 0; - virtual std::vector sample_k(int k) = 0; + virtual std::vector sample_k( + int k, const std::shared_ptr rng) = 0; }; class RandomSampler : public Sampler { public: virtual ~RandomSampler() {} virtual void build(GraphEdgeBlob *edges); - virtual std::vector sample_k(int k); + virtual std::vector sample_k(int k, + const std::shared_ptr rng); GraphEdgeBlob *edges; }; @@ -46,7 +50,8 @@ class WeightedSampler : public Sampler { GraphEdgeBlob *edges; virtual void build(GraphEdgeBlob *edges); virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); - virtual std::vector sample_k(int k); + virtual std::vector sample_k(int k, + const std::shared_ptr rng); private: int sample(float query_weight, @@ -54,5 +59,5 @@ class WeightedSampler : public Sampler { std::unordered_map &subtract_count_map, float &subtract); }; -} -} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/graph_edge.h b/paddle/fluid/distributed/table/graph_edge.h deleted file mode 100644 index 
3dfe5a6f357a7cd7d79834a20b6411995665f4fa..0000000000000000000000000000000000000000 --- a/paddle/fluid/distributed/table/graph_edge.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -namespace paddle { -namespace distributed { - -class GraphEdgeBlob { - public: - GraphEdgeBlob() {} - virtual ~GraphEdgeBlob() {} - size_t size() { return id_arr.size(); } - virtual void add_edge(uint64_t id, float weight); - uint64_t get_id(int idx) { return id_arr[idx]; } - virtual float get_weight(int idx) { return 1; } - - protected: - std::vector id_arr; -}; - -class WeightedGraphEdgeBlob : public GraphEdgeBlob { - public: - WeightedGraphEdgeBlob() {} - virtual ~WeightedGraphEdgeBlob() {} - virtual void add_edge(uint64_t id, float weight); - virtual float get_weight(int idx) { return weight_arr[idx]; } - - protected: - std::vector weight_arr; -}; -} -} diff --git a/paddle/fluid/distributed/table/graph_node.cc b/paddle/fluid/distributed/table/graph_node.cc deleted file mode 100644 index 27a2cafaf4f0fec95de818204ebd191a5083e50a..0000000000000000000000000000000000000000 --- a/paddle/fluid/distributed/table/graph_node.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
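[Reviewer sketch, not part of the patch] The RandomSampler::sample_k rewrite earlier in this patch drops srand/rand in favor of a caller-supplied random engine and keeps the "replacement map" trick for drawing k distinct indices without building a full permutation. The following self-contained sketch shows that sampling idea under the assumption that the engine is std::mt19937_64; the function and variable names are made up and are not the Paddle classes.

#include <cstdint>
#include <memory>
#include <random>
#include <unordered_map>
#include <vector>

// Draw k distinct indices from [0, n) without replacement, touching only
// O(k) map entries instead of shuffling an n-element array.
std::vector<int> SampleK(int k, int n,
                         const std::shared_ptr<std::mt19937_64>& rng) {
  if (k >= n) {
    std::vector<int> all(n);
    for (int i = 0; i < n; ++i) all[i] = i;
    return all;
  }
  std::vector<int> result;
  std::unordered_map<int, int> replace_map;  // records virtual swaps with the tail
  while (k--) {
    std::uniform_int_distribution<int> dist(0, n - 1);
    int r = dist(*rng);
    auto it = replace_map.find(r);
    result.push_back(it == replace_map.end() ? r : it->second);
    // Move the (possibly remapped) last element into slot r, then shrink n.
    auto last = replace_map.find(n - 1);
    replace_map[r] = (last == replace_map.end()) ? n - 1 : last->second;
    --n;
  }
  return result;
}

int main() {
  auto rng = std::make_shared<std::mt19937_64>(2021);
  auto sample = SampleK(3, 10, rng);
  return sample.size() == 3 ? 0 : 1;
}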
- -#include "paddle/fluid/distributed/table/graph_node.h" -#include -namespace paddle { -namespace distributed { - -GraphNode::~GraphNode() { - if (sampler != nullptr) { - delete sampler; - sampler = nullptr; - } - if (edges != nullptr) { - delete edges; - edges = nullptr; - } -} - -int Node::weight_size = sizeof(float); -int Node::id_size = sizeof(uint64_t); -int Node::int_size = sizeof(int); - -int Node::get_size(bool need_feature) { return id_size + int_size; } - -void Node::to_buffer(char* buffer, bool need_feature) { - memcpy(buffer, &id, id_size); - buffer += id_size; - - int feat_num = 0; - memcpy(buffer, &feat_num, sizeof(int)); -} - -void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } - -int FeatureNode::get_size(bool need_feature) { - int size = id_size + int_size; // id, feat_num - if (need_feature) { - size += feature.size() * int_size; - for (const std::string& fea : feature) { - size += fea.size(); - } - } - return size; -} - -void GraphNode::build_edges(bool is_weighted) { - if (edges == nullptr) { - if (is_weighted == true) { - edges = new WeightedGraphEdgeBlob(); - } else { - edges = new GraphEdgeBlob(); - } - } -} -void GraphNode::build_sampler(std::string sample_type) { - if (sample_type == "random") { - sampler = new RandomSampler(); - } else if (sample_type == "weighted") { - sampler = new WeightedSampler(); - } - sampler->build(edges); -} -void FeatureNode::to_buffer(char* buffer, bool need_feature) { - memcpy(buffer, &id, id_size); - buffer += id_size; - - int feat_num = 0; - int feat_len; - if (need_feature) { - feat_num += feature.size(); - memcpy(buffer, &feat_num, sizeof(int)); - buffer += sizeof(int); - for (int i = 0; i < feat_num; ++i) { - feat_len = feature[i].size(); - memcpy(buffer, &feat_len, sizeof(int)); - buffer += sizeof(int); - memcpy(buffer, feature[i].c_str(), feature[i].size()); - buffer += feature[i].size(); - } - } else { - memcpy(buffer, &feat_num, sizeof(int)); - } -} -void FeatureNode::recover_from_buffer(char* buffer) { - int feat_num, feat_len; - memcpy(&id, buffer, id_size); - buffer += id_size; - - memcpy(&feat_num, buffer, sizeof(int)); - buffer += sizeof(int); - - feature.clear(); - for (int i = 0; i < feat_num; ++i) { - memcpy(&feat_len, buffer, sizeof(int)); - buffer += sizeof(int); - - char str[feat_len + 1]; - memcpy(str, buffer, feat_len); - buffer += feat_len; - str[feat_len] = '\0'; - feature.push_back(std::string(str)); - } -} -} -} diff --git a/paddle/fluid/distributed/table/graph_node.h b/paddle/fluid/distributed/table/graph_node.h deleted file mode 100644 index c3e8e3ce5b50d06945857ded1db168f84f955c5f..0000000000000000000000000000000000000000 --- a/paddle/fluid/distributed/table/graph_node.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include -#include -#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" -namespace paddle { -namespace distributed { - -class Node { - public: - Node() {} - Node(uint64_t id) : id(id) {} - virtual ~Node() {} - static int id_size, int_size, weight_size; - uint64_t get_id() { return id; } - void set_id(uint64_t id) { this->id = id; } - - virtual void build_edges(bool is_weighted) {} - virtual void build_sampler(std::string sample_type) {} - virtual void add_edge(uint64_t id, float weight) {} - virtual std::vector sample_k(int k) { return std::vector(); } - virtual uint64_t get_neighbor_id(int idx) { return 0; } - virtual float get_neighbor_weight(int idx) { return 1.; } - - virtual int get_size(bool need_feature); - virtual void to_buffer(char *buffer, bool need_feature); - virtual void recover_from_buffer(char *buffer); - virtual std::string get_feature(int idx) { return std::string(""); } - virtual void set_feature(int idx, std::string str) {} - virtual void set_feature_size(int size) {} - virtual int get_feature_size() { return 0; } - - protected: - uint64_t id; -}; - -class GraphNode : public Node { - public: - GraphNode() : Node(), sampler(nullptr), edges(nullptr) {} - GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {} - virtual ~GraphNode(); - virtual void build_edges(bool is_weighted); - virtual void build_sampler(std::string sample_type); - virtual void add_edge(uint64_t id, float weight) { - edges->add_edge(id, weight); - } - virtual std::vector sample_k(int k) { return sampler->sample_k(k); } - virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } - virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } - - protected: - Sampler *sampler; - GraphEdgeBlob *edges; -}; - -class FeatureNode : public Node { - public: - FeatureNode() : Node() {} - FeatureNode(uint64_t id) : Node(id) {} - virtual ~FeatureNode() {} - virtual int get_size(bool need_feature); - virtual void to_buffer(char *buffer, bool need_feature); - virtual void recover_from_buffer(char *buffer); - virtual std::string get_feature(int idx) { - if (idx < (int)this->feature.size()) { - return this->feature[idx]; - } else { - return std::string(""); - } - } - - virtual void set_feature(int idx, std::string str) { - if (idx >= (int)this->feature.size()) { - this->feature.resize(idx + 1); - } - this->feature[idx] = str; - } - virtual void set_feature_size(int size) { this->feature.resize(size); } - virtual int get_feature_size() { return this->feature.size(); } - - template - static std::string parse_value_to_bytes(std::vector feat_str) { - T v; - size_t Tsize = sizeof(T) * feat_str.size(); - char buffer[Tsize]; - for (size_t i = 0; i < feat_str.size(); i++) { - std::stringstream ss(feat_str[i]); - ss >> v; - std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); - } - return std::string(buffer, Tsize); - } - - template - static std::vector parse_bytes_to_array(std::string feat_str) { - T v; - std::vector out; - size_t start = 0; - const char *buffer = feat_str.data(); - while (start < feat_str.size()) { - std::memcpy((char *)&v, buffer + start, sizeof(T)); - start += sizeof(T); - out.push_back(v); - } - return out; - } - - protected: - std::vector feature; -}; -} -} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph_weighted_sampler.cc deleted file mode 100644 index 059a1d64bc392d7ef6936c008bbeec3bef3a5fb9..0000000000000000000000000000000000000000 --- 
a/paddle/fluid/distributed/table/graph_weighted_sampler.cc +++ /dev/null @@ -1,150 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" -#include -#include -namespace paddle { -namespace distributed { - -void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } - -std::vector RandomSampler::sample_k(int k) { - int n = edges->size(); - if (k > n) { - k = n; - } - struct timespec tn; - clock_gettime(CLOCK_REALTIME, &tn); - srand(tn.tv_nsec); - std::vector sample_result; - std::unordered_map replace_map; - while (k--) { - int rand_int = rand() % n; - auto iter = replace_map.find(rand_int); - if (iter == replace_map.end()) { - sample_result.push_back(rand_int); - } else { - sample_result.push_back(iter->second); - } - - iter = replace_map.find(n - 1); - if (iter == replace_map.end()) { - replace_map[rand_int] = n - 1; - } else { - replace_map[rand_int] = iter->second; - } - --n; - } - return sample_result; -} - -WeightedSampler::WeightedSampler() { - left = nullptr; - right = nullptr; - edges = nullptr; -} - -WeightedSampler::~WeightedSampler() { - if (left != nullptr) { - delete left; - left = nullptr; - } - if (right != nullptr) { - delete right; - right = nullptr; - } -} - -void WeightedSampler::build(GraphEdgeBlob *edges) { - if (left != nullptr) { - delete left; - left = nullptr; - } - if (right != nullptr) { - delete right; - right = nullptr; - } - return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size()); -} - -void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, - int end) { - count = 0; - this->edges = edges; - if (start + 1 == end) { - left = right = nullptr; - idx = start; - count = 1; - weight = edges->get_weight(idx); - - } else { - left = new WeightedSampler(); - right = new WeightedSampler(); - left->build_one(edges, start, start + (end - start) / 2); - right->build_one(edges, start + (end - start) / 2, end); - weight = left->weight + right->weight; - count = left->count + right->count; - } -} -std::vector WeightedSampler::sample_k(int k) { - if (k > count) { - k = count; - } - std::vector sample_result; - float subtract; - std::unordered_map subtract_weight_map; - std::unordered_map subtract_count_map; - struct timespec tn; - clock_gettime(CLOCK_REALTIME, &tn); - srand(tn.tv_nsec); - while (k--) { - float query_weight = rand() % 100000 / 100000.0; - query_weight *= weight - subtract_weight_map[this]; - sample_result.push_back(sample(query_weight, subtract_weight_map, - subtract_count_map, subtract)); - } - return sample_result; -} - -int WeightedSampler::sample( - float query_weight, - std::unordered_map &subtract_weight_map, - std::unordered_map &subtract_count_map, - float &subtract) { - if (left == nullptr) { - subtract_weight_map[this] = weight; - subtract = weight; - subtract_count_map[this] = 1; - return idx; - } - int left_count = left->count - subtract_count_map[left]; - int 
right_count = right->count - subtract_count_map[right]; - float left_subtract = subtract_weight_map[left]; - int return_idx; - if (right_count == 0 || - left_count > 0 && left->weight - left_subtract >= query_weight) { - return_idx = left->sample(query_weight, subtract_weight_map, - subtract_count_map, subtract); - } else { - return_idx = - right->sample(query_weight - (left->weight - left_subtract), - subtract_weight_map, subtract_count_map, subtract); - } - subtract_weight_map[this] += subtract; - subtract_count_map[this]++; - return return_idx; -} -} -} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph_weighted_sampler.h deleted file mode 100644 index cfc341d27c6b766fcee57e8973a4353d4fe93b4e..0000000000000000000000000000000000000000 --- a/paddle/fluid/distributed/table/graph_weighted_sampler.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "paddle/fluid/distributed/table/graph_edge.h" -namespace paddle { -namespace distributed { - -class Sampler { - public: - virtual ~Sampler() {} - virtual void build(GraphEdgeBlob *edges) = 0; - virtual std::vector sample_k(int k) = 0; -}; - -class RandomSampler : public Sampler { - public: - virtual ~RandomSampler() {} - virtual void build(GraphEdgeBlob *edges); - virtual std::vector sample_k(int k); - GraphEdgeBlob *edges; -}; - -class WeightedSampler : public Sampler { - public: - WeightedSampler(); - virtual ~WeightedSampler(); - WeightedSampler *left, *right; - float weight; - int count; - int idx; - GraphEdgeBlob *edges; - virtual void build(GraphEdgeBlob *edges); - virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); - virtual std::vector sample_k(int k); - - private: - int sample(float query_weight, - std::unordered_map &subtract_weight_map, - std::unordered_map &subtract_count_map, - float &subtract); -}; -} -} diff --git a/paddle/fluid/distributed/table/ssd_sparse_table.cc b/paddle/fluid/distributed/table/ssd_sparse_table.cc new file mode 100644 index 0000000000000000000000000000000000000000..5de6de3d2909d670c4bfdabdac37e72fcb125d5e --- /dev/null +++ b/paddle/fluid/distributed/table/ssd_sparse_table.cc @@ -0,0 +1,362 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
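[Reviewer sketch, not part of the patch] The SSDSparseTable added in this new file keeps hot features in the in-memory shards and consults RocksDB on a miss, promoting values back into memory (see pull_sparse and pull_sparse_ptr below). The sketch that follows shows the general look-aside pattern with a stand-in key-value store; DiskStore and Lookup are hypothetical names used only for illustration, not the table's actual API.

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for an on-disk KV store such as RocksDB.
struct DiskStore {
  std::unordered_map<std::uint64_t, std::vector<float>> kv;
  // Returns 0 and fills *out on hit, 1 on miss (a non-zero "not found" code).
  int Get(std::uint64_t key, std::vector<float>* out) const {
    auto it = kv.find(key);
    if (it == kv.end()) return 1;
    *out = it->second;
    return 0;
  }
};

// Look-aside read: serve from memory if present, otherwise consult the disk
// store and promote the value into memory; create a fresh value on a full miss.
const std::vector<float>& Lookup(
    std::uint64_t key,
    std::unordered_map<std::uint64_t, std::vector<float>>* mem,
    const DiskStore& disk, int value_len) {
  auto it = mem->find(key);
  if (it != mem->end()) return it->second;       // already in memory
  std::vector<float> value;
  if (disk.Get(key, &value) != 0) {
    value.assign(value_len, 0.0f);               // full miss: initialize
  }
  return (*mem)[key] = value;                    // promote / insert
}

int main() {
  DiskStore disk;
  disk.kv[42] = {1.f, 2.f, 3.f};
  std::unordered_map<std::uint64_t, std::vector<float>> mem;
  const auto& v = Lookup(42, &mem, disk, 3);
  return v.size() == 3 ? 0 : 1;
}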
+ +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/distributed/table/ssd_sparse_table.h" + +DEFINE_string(rocksdb_path, "database", "path of sparse table rocksdb file"); + +namespace paddle { +namespace distributed { + +int32_t SSDSparseTable::initialize() { + _shards_task_pool.resize(task_pool_size_); + for (int i = 0; i < _shards_task_pool.size(); ++i) { + _shards_task_pool[i].reset(new ::ThreadPool(1)); + } + + sync = _config.common().sync(); + VLOG(1) << "table " << _config.common().table_name() << " is sync: " << sync; + + _global_lr = new float(1.0); + + auto common = _config.common(); + int size = static_cast(common.params().size()); + + size_t offset = 0; + for (int x = 0; x < size; ++x) { + auto& varname = common.params()[x]; + auto& dim = common.dims()[x]; + + value_idx_[varname] = x; + value_names_.push_back(varname); + value_dims_.push_back(dim); + value_offsets_.push_back(offset); + initializer_attrs_.push_back(common.initializers()[x]); + + if (varname == "Param") { + param_dim_ = dim; + param_offset_ = offset; + } + + offset += dim; + } + + initialize_value(); + initialize_optimizer(); + initialize_recorder(); + _db = paddle::distributed::RocksDBHandler::GetInstance(); + _db->initialize(FLAGS_rocksdb_path, task_pool_size_); + return 0; +} + +int32_t SSDSparseTable::pull_sparse(float* pull_values, + const PullSparseValue& pull_value) { + auto shard_num = task_pool_size_; + std::vector> tasks(shard_num); + + for (int shard_id = 0; shard_id < shard_num; ++shard_id) { + tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( + [this, shard_id, shard_num, &pull_value, &pull_values]() -> int { + auto& block = shard_values_[shard_id]; + + std::vector offsets; + pull_value.Fission(shard_id, shard_num, &offsets); + + for (auto& offset : offsets) { + auto feasign = pull_value.feasigns_[offset]; + auto frequencie = pull_value.frequencies_[offset]; + float* embedding = nullptr; + auto iter = block->Find(feasign); + // in mem + if (iter == block->end()) { + embedding = iter->second->data_.data(); + if (pull_value.is_training_) { + block->AttrUpdate(iter->second, frequencie); + } + } else { + // need create + std::string tmp_str(""); + if (_db->get(shard_id, (char*)&feasign, sizeof(uint64_t), + tmp_str) > 0) { + embedding = block->Init(feasign, true, frequencie); + } else { + // in db + int data_size = tmp_str.size() / sizeof(float); + int value_size = block->value_length_; + float* db_value = (float*)const_cast(tmp_str.c_str()); + VALUE* value = block->InitGet(feasign); + + // copy to mem + memcpy(value->data_.data(), db_value, + value_size * sizeof(float)); + embedding = db_value; + + // param, count, unseen_day + value->count_ = db_value[value_size]; + value->unseen_days_ = db_value[value_size + 1]; + value->is_entry_ = db_value[value_size + 2]; + if (pull_value.is_training_) { + block->AttrUpdate(value, frequencie); + } + } + } + std::copy_n(embedding + param_offset_, param_dim_, + pull_values + param_dim_ * offset); + } + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + return 0; +} + +int32_t SSDSparseTable::pull_sparse_ptr(char** pull_values, + const uint64_t* keys, size_t num) { + auto shard_num = task_pool_size_; + std::vector> tasks(shard_num); + + std::vector> offset_bucket; + offset_bucket.resize(task_pool_size_); + + for (int x = 0; x < num; ++x) { + auto y = keys[x] % task_pool_size_; + offset_bucket[y].push_back(x); + } + + for (int shard_id = 0; shard_id < shard_num; ++shard_id) { + tasks[shard_id] 
= _shards_task_pool[shard_id]->enqueue( + [this, shard_id, &keys, &pull_values, &offset_bucket]() -> int { + auto& block = shard_values_[shard_id]; + auto& offsets = offset_bucket[shard_id]; + + for (auto& offset : offsets) { + auto feasign = keys[offset]; + auto iter = block->Find(feasign); + VALUE* value = nullptr; + // in mem + if (iter != block->end()) { + value = iter->second; + } else { + // need create + std::string tmp_str(""); + if (_db->get(shard_id, (char*)&feasign, sizeof(uint64_t), + tmp_str) > 0) { + value = block->InitGet(feasign); + } else { + // in db + int data_size = tmp_str.size() / sizeof(float); + int value_size = block->value_length_; + float* db_value = (float*)const_cast(tmp_str.c_str()); + value = block->InitGet(feasign); + + // copy to mem + memcpy(value->data_.data(), db_value, + value_size * sizeof(float)); + + // param, count, unseen_day + value->count_ = db_value[value_size]; + value->unseen_days_ = db_value[value_size + 1]; + value->is_entry_ = db_value[value_size + 2]; + } + } + pull_values[offset] = (char*)value; + } + return 0; + }); + } + + for (size_t shard_id = 0; shard_id < tasks.size(); ++shard_id) { + tasks[shard_id].wait(); + } + return 0; +} + +int32_t SSDSparseTable::shrink(const std::string& param) { return 0; } + +int32_t SSDSparseTable::update_table() { + int count = 0; + int value_size = shard_values_[0]->value_length_; + int db_size = 3 + value_size; + float tmp_value[db_size]; + + for (size_t i = 0; i < task_pool_size_; ++i) { + auto& block = shard_values_[i]; + + for (auto& table : block->values_) { + for (auto iter = table.begin(); iter != table.end();) { + VALUE* value = iter->second; + if (value->unseen_days_ >= 1) { + tmp_value[value_size] = value->count_; + tmp_value[value_size + 1] = value->unseen_days_; + tmp_value[value_size + 2] = value->is_entry_; + memcpy(tmp_value, value->data_.data(), sizeof(float) * value_size); + _db->put(i, (char*)&(iter->first), sizeof(uint64_t), (char*)tmp_value, + db_size * sizeof(float)); + count++; + + butil::return_object(iter->second); + iter = table.erase(iter); + } else { + ++iter; + } + } + } + _db->flush(i); + } + VLOG(1) << "Table>> update count: " << count; + return 0; +} + +int64_t SSDSparseTable::SaveValueToText(std::ostream* os, + std::shared_ptr block, + std::shared_ptr<::ThreadPool> pool, + const int mode, int shard_id) { + int64_t save_num = 0; + + for (auto& table : block->values_) { + for (auto& value : table) { + if (mode == SaveMode::delta && !value.second->need_save_) { + continue; + } + + ++save_num; + + std::stringstream ss; + auto* vs = value.second->data_.data(); + + auto id = value.first; + + ss << id << "\t" << value.second->count_ << "\t" + << value.second->unseen_days_ << "\t" << value.second->is_entry_ + << "\t"; + + for (int i = 0; i < block->value_length_ - 1; i++) { + ss << std::to_string(vs[i]) << ","; + } + + ss << std::to_string(vs[block->value_length_ - 1]); + ss << "\n"; + + os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); + + if (mode == SaveMode::base || mode == SaveMode::delta) { + value.second->need_save_ = false; + } + } + } + + if (mode != 1) { + int value_size = block->value_length_; + auto* it = _db->get_iterator(shard_id); + + for (it->SeekToFirst(); it->Valid(); it->Next()) { + float* value = (float*)const_cast(it->value().data()); + std::stringstream ss; + ss << *((uint64_t*)const_cast(it->key().data())) << "\t" + << value[value_size] << "\t" << value[value_size + 1] << "\t" + << value[value_size + 2] << "\t"; + for (int i = 0; i < 
block->value_length_ - 1; i++) { + ss << std::to_string(value[i]) << ","; + } + + ss << std::to_string(value[block->value_length_ - 1]); + ss << "\n"; + + os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); + } + } + + return save_num; +} + +int32_t SSDSparseTable::load(const std::string& path, + const std::string& param) { + rwlock_->WRLock(); + VLOG(3) << "ssd sparse table load with " << path << " with meta " << param; + LoadFromText(path, param, _shard_idx, _shard_num, task_pool_size_, + &shard_values_); + rwlock_->UNLock(); + return 0; +} + +int64_t SSDSparseTable::LoadFromText( + const std::string& valuepath, const std::string& metapath, + const int pserver_id, const int pserver_num, const int local_shard_num, + std::vector>* blocks) { + Meta meta = Meta(metapath); + + int num_lines = 0; + std::ifstream file(valuepath); + std::string line; + + int value_size = shard_values_[0]->value_length_; + int db_size = 3 + value_size; + float tmp_value[db_size]; + + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + auto id = lexical_cast(values[0]); + + if (id % pserver_num != pserver_id) { + VLOG(3) << "will not load " << values[0] << " from " << valuepath + << ", please check id distribution"; + continue; + } + + auto shard_id = id % local_shard_num; + auto block = blocks->at(shard_id); + + std::vector> kvalues; + ProcessALine(values, meta, id, &kvalues); + + block->Init(id, false); + + VALUE* value_instant = block->GetValue(id); + + if (values.size() == 5) { + value_instant->count_ = lexical_cast(values[1]); + value_instant->unseen_days_ = lexical_cast(values[2]); + value_instant->is_entry_ = + static_cast(lexical_cast(values[3])); + } + + std::vector block_values = block->Get(id, meta.names, meta.dims); + auto blas = GetBlas(); + for (int x = 0; x < meta.names.size(); ++x) { + blas.VCOPY(meta.dims[x], kvalues[x].data(), block_values[x]); + } + VLOG(3) << "loading: " << id + << "unseen day: " << value_instant->unseen_days_; + if (value_instant->unseen_days_ >= 1) { + tmp_value[value_size] = value_instant->count_; + tmp_value[value_size + 1] = value_instant->unseen_days_; + tmp_value[value_size + 2] = value_instant->is_entry_; + memcpy(tmp_value, value_instant->data_.data(), + sizeof(float) * value_size); + _db->put(shard_id, (char*)&(id), sizeof(uint64_t), (char*)tmp_value, + db_size * sizeof(float)); + block->erase(id); + } + } + + return 0; +} + +} // namespace ps +} // namespace paddle +#endif diff --git a/paddle/fluid/distributed/table/ssd_sparse_table.h b/paddle/fluid/distributed/table/ssd_sparse_table.h new file mode 100644 index 0000000000000000000000000000000000000000..5e85fa3ce59d13c1f996f00a4b5b7dd9114ed764 --- /dev/null +++ b/paddle/fluid/distributed/table/ssd_sparse_table.h @@ -0,0 +1,61 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
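[Reviewer sketch, not part of the patch] Both CommonSparseTable::SaveValueToText and the SSD variant above write one record per feasign as tab-separated metadata followed by comma-separated floats: id, count, unseen_days, is_entry, then v0,v1,...,v(n-1) and a newline. The snippet below is a simplified, standard-library-only illustration of producing that layout; it is not the production writer and its names are made up.

#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Serialize one record: id, count, unseen_days, is_entry are tab separated;
// the embedding values are comma separated and the record ends with '\n'.
std::string EncodeRecord(std::uint64_t id, int count, int unseen_days,
                         bool is_entry, const std::vector<float>& values) {
  std::ostringstream ss;
  ss << id << "\t" << count << "\t" << unseen_days << "\t" << is_entry << "\t";
  for (std::size_t i = 0; i + 1 < values.size(); ++i) ss << values[i] << ",";
  if (!values.empty()) ss << values.back();
  ss << "\n";
  return ss.str();
}

int main() {
  std::string line = EncodeRecord(42, 3, 0, true, {0.1f, 0.2f, 0.3f});
  std::cout << line;  // 42<TAB>3<TAB>0<TAB>1<TAB>0.1,0.2,0.3
  return 0;
}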
+ +#pragma once +#include "paddle/fluid/distributed/table/common_sparse_table.h" +#include "paddle/fluid/distributed/table/depends/rocksdb_warpper.h" +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace distributed { +class SSDSparseTable : public CommonSparseTable { + public: + SSDSparseTable() {} + virtual ~SSDSparseTable() {} + + virtual int32_t initialize() override; + + void SaveMetaToText(std::ostream* os, const CommonAccessorParameter& common, + const size_t shard_idx, const int64_t total); + + int64_t SaveValueToText(std::ostream* os, std::shared_ptr block, + std::shared_ptr<::ThreadPool> pool, const int mode, + int shard_id); + + virtual int64_t LoadFromText( + const std::string& valuepath, const std::string& metapath, + const int pserver_id, const int pserver_num, const int local_shard_num, + std::vector>* blocks); + + virtual int32_t load(const std::string& path, const std::string& param); + + // exchange data + virtual int32_t update_table(); + + virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); + + virtual int32_t pull_sparse_ptr(char** pull_values, const uint64_t* keys, + size_t num); + + virtual int32_t flush() override { return 0; } + virtual int32_t shrink(const std::string& param) override; + virtual void clear() override {} + + private: + RocksDBHandler* _db; + int64_t _cache_tk_size; +}; + +} // namespace ps +} // namespace paddle +#endif diff --git a/paddle/fluid/distributed/table/table.cc b/paddle/fluid/distributed/table/table.cc index 600be954cb59663fff6f867c020248a92e81a151..0f8753c074634189ffd39350425e6c1936569631 100644 --- a/paddle/fluid/distributed/table/table.cc +++ b/paddle/fluid/distributed/table/table.cc @@ -21,6 +21,9 @@ #include "paddle/fluid/distributed/table/common_graph_table.h" #include "paddle/fluid/distributed/table/common_sparse_table.h" #include "paddle/fluid/distributed/table/sparse_geo_table.h" +#ifdef PADDLE_WITH_HETERPS +#include "paddle/fluid/distributed/table/ssd_sparse_table.h" +#endif #include "paddle/fluid/distributed/table/tensor_accessor.h" #include "paddle/fluid/distributed/table/tensor_table.h" @@ -29,6 +32,9 @@ namespace distributed { REGISTER_PSCORE_CLASS(Table, GraphTable); REGISTER_PSCORE_CLASS(Table, CommonDenseTable); REGISTER_PSCORE_CLASS(Table, CommonSparseTable); +#ifdef PADDLE_WITH_HETERPS +REGISTER_PSCORE_CLASS(Table, SSDSparseTable); +#endif REGISTER_PSCORE_CLASS(Table, SparseGeoTable); REGISTER_PSCORE_CLASS(Table, BarrierTable); REGISTER_PSCORE_CLASS(Table, TensorTable); diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index 81a1ff5eced2bb36b8f917a31de1e214b272bfa3..55fc92c9b57859772e05ebee0f0cb084ddcfa04a 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -36,7 +36,7 @@ class Table { Table() {} virtual ~Table() {} virtual int32_t initialize(const TableParameter &config, - const FsClientParameter &fs_config) final; + const FsClientParameter &fs_config); virtual int32_t pull_dense(float *values, size_t num) = 0; virtual int32_t push_dense(const float *values, size_t num) = 0; @@ -58,7 +58,9 @@ class Table { virtual int32_t push_sparse(const uint64_t *keys, const float *values, size_t num) = 0; virtual int32_t push_sparse(const uint64_t *keys, const float **values, - size_t num){}; + size_t num) { + return 0; + } virtual int32_t push_sparse_param(const uint64_t *keys, const float *values, size_t num) { return 0; @@ -108,7 +110,7 @@ class Table { virtual int32_t save(const std::string &path, const 
std::string &converter) = 0; - virtual int32_t set_shard(size_t shard_idx, size_t shard_num) final { + virtual int32_t set_shard(size_t shard_idx, size_t shard_num) { _shard_idx = shard_idx; _shard_num = shard_num; return initialize_shard(); @@ -123,7 +125,7 @@ class Table { protected: virtual int32_t initialize() = 0; - virtual int32_t initialize_accessor() final; + virtual int32_t initialize_accessor(); virtual int32_t initialize_shard() = 0; virtual std::string table_dir(const std::string &model_dir) { return paddle::string::format_string("%s/%03d/", model_dir.c_str(), diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index b756c740ac764ce6effc5d885b6eb7d1e775f956..af87e1b6cc61d190cf06b601f05455d8ac976d71 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -1,8 +1,10 @@ set_source_files_properties(table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) +cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor +ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) set_source_files_properties(dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) +cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table +tensor_accessor ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) set_source_files_properties(barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(barrier_table_test SRCS barrier_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index b268bb449e14619048e89c8933dbae7daf66537b..b8630aed02ffe60181ddb6b41810f5bea602b733 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -124,7 +124,6 @@ void testSingleSampleNeighboor( for (auto g : s) { ASSERT_EQ(true, s1.find(g) != s1.end()); } - VLOG(0) << "test single done"; s.clear(); s1.clear(); vs.clear(); @@ -141,6 +140,57 @@ void testSingleSampleNeighboor( } } +void testAddNode( + std::shared_ptr& worker_ptr_) { + worker_ptr_->clear_nodes(0); + int total_num = 270000; + uint64_t id; + std::unordered_set id_set; + for (int i = 0; i < total_num; i++) { + while (id_set.find(id = rand()) != id_set.end()) + ; + id_set.insert(id); + } + std::vector id_list(id_set.begin(), id_set.end()); + std::vector weight_list; + auto status = worker_ptr_->add_graph_node(0, id_list, weight_list); + status.wait(); + std::vector ids[2]; + for (int i = 0; i < 2; i++) { + auto sample_status = + worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); + sample_status.wait(); + } + std::unordered_set id_set_check(ids[0].begin(), ids[0].end()); + for (auto x : ids[1]) id_set_check.insert(x); + ASSERT_EQ(id_set.size(), id_set_check.size()); + for (auto x : id_set) { + ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true); + } + std::vector remove_ids; + for (auto p : id_set_check) { + if (remove_ids.size() == 0) + remove_ids.push_back(p); + else if (remove_ids.size() < total_num / 2 && rand() % 2 == 1) { + remove_ids.push_back(p); + } + } + for (auto p : remove_ids) id_set_check.erase(p); + status = 
worker_ptr_->remove_graph_node(0, remove_ids); + status.wait(); + for (int i = 0; i < 2; i++) ids[i].clear(); + for (int i = 0; i < 2; i++) { + auto sample_status = + worker_ptr_->random_sample_nodes(0, i, total_num, ids[i]); + sample_status.wait(); + } + std::unordered_set id_set_check1(ids[0].begin(), ids[0].end()); + for (auto x : ids[1]) id_set_check1.insert(x); + ASSERT_EQ(id_set_check1.size(), id_set_check.size()); + for (auto x : id_set_check1) { + ASSERT_EQ(id_set_check.find(x) != id_set_check.end(), true); + } +} void testBatchSampleNeighboor( std::shared_ptr& worker_ptr_) { std::vector>> vs; @@ -527,6 +577,7 @@ void RunBrpcPushSparse() { std::remove(edge_file_name); std::remove(node_file_name); + testAddNode(worker_ptr_); LOG(INFO) << "Run stop_server"; worker_ptr_->stop_server(); LOG(INFO) << "Run finalize_worker"; diff --git a/paddle/fluid/extension/include/ext_all.h b/paddle/fluid/extension/include/ext_all.h index f2b3bcf5191c378af9d550917138f1676ae45eaf..6987b33012f64d6e4d473ffc7ae666c432c65967 100644 --- a/paddle/fluid/extension/include/ext_all.h +++ b/paddle/fluid/extension/include/ext_all.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once -#if !defined(_MSC_VER) && __cplusplus < 199711L -#error C++11 or later compatible compiler is required to use Paddle. +#if !defined(_MSC_VER) && __cplusplus < 201402L +#error C++14 or later compatible compiler is required to use Paddle. #endif #ifdef _WIN32 diff --git a/paddle/fluid/extension/include/ext_dtype.h b/paddle/fluid/extension/include/ext_dtype.h index 3890631a6f8a9e99948e32cdd3cb8c1e00c2de75..a0816b65a3d15c9cf1384d1b6f18fa79f9199a83 100644 --- a/paddle/fluid/extension/include/ext_dtype.h +++ b/paddle/fluid/extension/include/ext_dtype.h @@ -16,15 +16,14 @@ limitations under the License. */ #include #include -#include "complex128.h" // NOLINT -#include "complex64.h" // NOLINT +#include "complex.h" // NOLINT #include "ext_exception.h" // NOLINT #include "float16.h" // NOLINT namespace paddle { -using complex64 = paddle::platform::complex64; -using complex128 = paddle::platform::complex128; +using complex64 = paddle::platform::complex; +using complex128 = paddle::platform::complex; using float16 = paddle::platform::float16; enum class DataType { diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index 8b2f7cc5bf13c99b80cd365f5c449f3d3b68bdc5..ab98bdc0bfb47e07e5742ac1ee9cebe60f5c7a69 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ b/paddle/fluid/extension/src/ext_tensor.cc @@ -19,8 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/custom_tensor_utils.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/transform.h" @@ -238,9 +237,9 @@ template PD_DLL_DECL Tensor Tensor::copy_to(const PlaceType &target_place) const; template PD_DLL_DECL Tensor Tensor::copy_to(const PlaceType &target_place) const; -template PD_DLL_DECL Tensor Tensor::copy_to( +template PD_DLL_DECL Tensor Tensor::copy_to>( const PlaceType &target_place) const; -template PD_DLL_DECL Tensor Tensor::copy_to( +template PD_DLL_DECL Tensor Tensor::copy_to>( const PlaceType &target_place) const; template PD_DLL_DECL Tensor Tensor::copy_to(const PlaceType &target_place) const; @@ -253,10 +252,10 @@ template PD_DLL_DECL uint8_t *Tensor::data() const; template PD_DLL_DECL int8_t *Tensor::data() const; template PD_DLL_DECL int16_t *Tensor::data() const; template PD_DLL_DECL bool *Tensor::data() const; -template PD_DLL_DECL paddle::platform::complex64 * -Tensor::data() const; -template PD_DLL_DECL paddle::platform::complex128 * -Tensor::data() const; +template PD_DLL_DECL paddle::platform::complex + *Tensor::data>() const; +template PD_DLL_DECL paddle::platform::complex + *Tensor::data>() const; template PD_DLL_DECL paddle::platform::float16 * Tensor::data() const; @@ -268,10 +267,10 @@ template PD_DLL_DECL uint8_t *Tensor::mutable_data(); template PD_DLL_DECL int8_t *Tensor::mutable_data(); template PD_DLL_DECL int16_t *Tensor::mutable_data(); template PD_DLL_DECL bool *Tensor::mutable_data(); -template PD_DLL_DECL paddle::platform::complex64 * -Tensor::mutable_data(); -template PD_DLL_DECL paddle::platform::complex128 * -Tensor::mutable_data(); +template PD_DLL_DECL paddle::platform::complex + *Tensor::mutable_data>(); +template PD_DLL_DECL paddle::platform::complex + *Tensor::mutable_data>(); template PD_DLL_DECL paddle::platform::float16 * Tensor::mutable_data(); @@ -289,10 +288,10 @@ template PD_DLL_DECL int8_t *Tensor::mutable_data( template PD_DLL_DECL int16_t *Tensor::mutable_data( const PlaceType &place); template PD_DLL_DECL bool *Tensor::mutable_data(const PlaceType &place); -template PD_DLL_DECL paddle::platform::complex64 * -Tensor::mutable_data(const PlaceType &place); -template PD_DLL_DECL paddle::platform::complex128 * -Tensor::mutable_data(const PlaceType &place); +template PD_DLL_DECL paddle::platform::complex * +Tensor::mutable_data>(const PlaceType &place); +template PD_DLL_DECL paddle::platform::complex * +Tensor::mutable_data>(const PlaceType &place); template PD_DLL_DECL paddle::platform::float16 * Tensor::mutable_data(const PlaceType &place); @@ -356,13 +355,13 @@ Tensor Tensor::cast(const DataType &target_type) const { dst_type, CastDataType(*tensor, rlt_tensor_, ctx)); break; case framework::proto::VarType::COMPLEX64: - framework::VisitDataType( - dst_type, - CastDataType(*tensor, rlt_tensor_, ctx)); + framework::VisitDataType(dst_type, + CastDataType>( + *tensor, rlt_tensor_, ctx)); break; case framework::proto::VarType::COMPLEX128: framework::VisitDataType(dst_type, - CastDataType( + CastDataType>( *tensor, rlt_tensor_, ctx)); break; case framework::proto::VarType::FP16: diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 
24bed277280839627738d755c1b1abc32416aee3..555cd91d242f82d58260e0367613a35444452b14 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -27,6 +27,22 @@ add_subdirectory(fleet) add_subdirectory(io) #ddim lib proto_library(framework_proto SRCS framework.proto) + +proto_library(op_def_proto SRCS op_def.proto DEPS framework_proto) +cc_library(op_def_api SRCS op_def_api.cc DEPS op_def_proto boost) + +FILE(GLOB OP_DEF_FILES ${PADDLE_SOURCE_DIR}/paddle/fluid/operators/compat/*.pbtxt) +FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt + "namespace { \n" + "const std::unordered_map op_def_map = { \n") +foreach(OP_DEF_FILE ${OP_DEF_FILES}) + FILE(READ ${OP_DEF_FILE} OP_DEF_CONTENT) + get_filename_component(OP_NAME ${OP_DEF_FILE} NAME_WE) + FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt + "{\"${OP_NAME}\",R\"(${OP_DEF_CONTENT})\"},\n") +endforeach(OP_DEF_FILE) +FILE(APPEND ${CMAKE_CURRENT_BINARY_DIR}/op_def.pbtxt "{\"\",\"\"}};\n}") + proto_library(heter_service_proto SRCS heter_service.proto) proto_library(data_feed_proto SRCS data_feed.proto) proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto @@ -94,14 +110,22 @@ cc_test(reader_test SRCS reader_test.cc DEPS reader) cc_library(threadpool SRCS threadpool.cc DEPS enforce) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) -cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto) +cc_library(var_type_traits SRCS var_type_traits.cc DEPS lod_tensor selected_rows framework_proto) if (WITH_GPU) target_link_libraries(var_type_traits dynload_cuda) endif() cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) +set(BRPC_DEPS "") +if(WITH_PSLIB OR WITH_PSCORE) + set(BRPC_DEPS brpc) + if(WITH_PSLIB_BRPC) + set(BRPC_DEPS pslib_brpc) + endif() +endif() + cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits) -cc_library(device_worker SRCS device_worker.cc DEPS trainer_desc_proto lod_tensor scope) +cc_library(device_worker SRCS device_worker.cc DEPS trainer_desc_proto lod_tensor scope ${BRPC_DEPS}) cc_test(device_worker_test SRCS device_worker_test.cc DEPS device_worker) cc_library(scope_pool SRCS scope_pool.cc DEPS scope) @@ -230,28 +254,35 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry denormal device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) -cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector) +cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector op_registry while_op_helper recurrent_op_helper conditional_block_op_helper) if(WITH_DISTRIBUTE) if(WITH_PSLIB) cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc heterxpu_trainer.cc data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc - heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc + ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer lod_rank_table 
feed_fetch_method collective_helper ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper data_feed_proto timer monitor - heter_service_proto pslib_brpc) + heter_service_proto ${BRPC_DEP}) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) elseif(WITH_PSCORE) cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc heterxpu_trainer.cc data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc - heterbox_worker.cc heterbox_trainer.cc downpour_worker.cc downpour_worker_opt.cc + downpour_worker.cc downpour_worker_opt.cc pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method @@ -265,28 +296,37 @@ if(WITH_DISTRIBUTE) dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc heterxpu_trainer.cc data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc - heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc + ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method graph_to_program_pass variable_helper timer monitor) endif() elseif(WITH_PSLIB) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() + set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(hetercpu_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(heterxpu_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc heterxpu_trainer.cc data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc - heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc + ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto 
data_feed_proto heter_service_proto trainer_desc_proto glog lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method - graph_to_program_pass variable_helper timer monitor pslib_brpc ) + graph_to_program_pass variable_helper timer monitor ${BRPC_DEP}) else() cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc heterxpu_trainer.cc data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc - heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc + ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method @@ -301,8 +341,14 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS fast_threaded_ssa_graph_executor variable_helper) cc_library(executor_cache SRCS executor_cache.cc DEPS executor) -cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS - conditional_block_op executor) +if(WITH_PSCORE) + get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS + conditional_block_op executor ${RPC_DEPS}) +else() + cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS + conditional_block_op executor) +endif() cc_library(prune SRCS prune.cc DEPS framework_proto boost) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry @@ -324,10 +370,10 @@ endif (NOT WIN32) cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) -cc_library(op_compatible_info SRCS op_compatible_info DEPS string_helper proto_desc) +cc_library(op_compatible_info SRCS op_compatible_info.cc DEPS string_helper proto_desc) cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatible_info proto_desc string_helper glog) -cc_library(save_load_util SRCS save_load_util DEPS tensor scope layer) +cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer) cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer) cc_library(generator SRCS generator.cc DEPS enforce place) @@ -369,36 +415,3 @@ cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) if(WITH_TESTING AND TEST selected_rows_test) set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120) endif() - -##### 2.0 New custom op extension mechanism related ##### - -# if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_ -if (WIN32) - set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer) - - set(PADDLE_CUSTOM_OP_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc - ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc) - set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE) - - cc_library(paddle_custom_op_shared - SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS 
${PADDLE_CUSTOM_OP_MODULES}) - - get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op) - target_link_libraries(paddle_custom_op_shared ${os_dependency_modules}) - - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}) - else() - set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}) - endif() - set(PADDLE_CUSTOM_OP_IMPORT_LIB - ${paddle_custom_op_lib_path}/paddle_custom_op.lib - CACHE INTERNAL "Paddle custom op import lib") - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${paddle_custom_op_lib_path}/paddle_custom_op.dll - CACHE INTERNAL "Paddle custom op dll") -endif() diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index 66b988ee1f1fb6486423b0d4196c883979ee6fe3..e9e1875765633990d7212c7963effc09c928b7a5 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -208,15 +208,27 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc); class AttrReader { public: - explicit AttrReader(const AttributeMap& attrs) : attrs_(attrs) {} + explicit AttrReader(const AttributeMap& attrs) + : attrs_(attrs), default_attrs_(nullptr) {} + + AttrReader(const AttributeMap& attrs, const AttributeMap& default_attrs) + : attrs_(attrs), default_attrs_(&default_attrs) {} template inline const T& Get(const std::string& name) const { - PADDLE_ENFORCE_NE(attrs_.count(name), 0, + auto it = attrs_.find(name); + bool found = it != attrs_.end(); + if (!found) { + if (default_attrs_ != nullptr) { + it = default_attrs_->find(name); + found = it != default_attrs_->end(); + } + } + PADDLE_ENFORCE_EQ(found, true, platform::errors::NotFound( "Attribute (%s) should be in AttributeMap.", name)); - Attribute& attr = const_cast(attrs_.at(name)); + Attribute& attr = const_cast(it->second); ExtractAttribute extract_attr(name); T* attr_value = extract_attr(attr); return *attr_value; @@ -224,6 +236,7 @@ class AttrReader { private: const AttributeMap& attrs_; + const AttributeMap* default_attrs_; }; // check whether a value(attribute) fit a certain limit @@ -234,8 +247,8 @@ class GreaterThanChecker { void operator()(const T& value) const { PADDLE_ENFORCE_GT( value, lower_bound_, - platform::errors::OutOfRange( - "Check for attribute value greater than a certain value failed.")); + platform::errors::OutOfRange("Check for attribute value greater than " + "a certain value failed.")); } private: @@ -332,9 +345,9 @@ class TypedAttrChecker { TypedAttrChecker& SetDefault(const T& default_value) { PADDLE_ENFORCE_EQ( default_value_setter_.empty(), true, - platform::errors::AlreadyExists( - "Attribute (%s) has a default value and cannot be set repeatedly.", - attr_name_)); + platform::errors::AlreadyExists("Attribute (%s) has a default value " + "and cannot be set repeatedly.", + attr_name_)); default_value_setter_.push_back(DefaultValueSetter(default_value)); return *this; } @@ -345,8 +358,8 @@ class TypedAttrChecker { return *this; } - void operator()(AttributeMap* attr_map, - bool get_default_value_only = false) const { + void operator()(AttributeMap* attr_map, bool get_default_value_only = false, + bool only_check_exist_value = false) const { if (get_default_value_only) { if (!default_value_setter_.empty()) { attr_map->emplace(attr_name_, default_value_setter_[0]()); @@ -354,21 +367,32 @@ class TypedAttrChecker { return; } - auto it = attr_map->find(attr_name_); - if (it == attr_map->end()) { - // 
user do not set this attr - PADDLE_ENFORCE_EQ( - default_value_setter_.empty(), false, - platform::errors::InvalidArgument( - "Attribute (%s) is not set correctly.", attr_name_)); - // default_value_setter_ has no more than one element - attr_map->emplace(attr_name_, default_value_setter_[0]()); - } - it = attr_map->find(attr_name_); - ExtractAttribute extract_attr(attr_name_); - T* attr_value = extract_attr(it->second); - for (const auto& checker : value_checkers_) { - checker(*attr_value); + if (only_check_exist_value) { + auto it = attr_map->find(attr_name_); + if (it != attr_map->end()) { + ExtractAttribute extract_attr(attr_name_); + T* attr_value = extract_attr(it->second); + for (const auto& checker : value_checkers_) { + checker(*attr_value); + } + } + } else { + auto it = attr_map->find(attr_name_); + if (it == attr_map->end()) { + // user do not set this attr + PADDLE_ENFORCE_EQ( + default_value_setter_.empty(), false, + platform::errors::InvalidArgument( + "Attribute (%s) is not set correctly.", attr_name_)); + // default_value_setter_ has no more than one element + auto tmp = attr_map->emplace(attr_name_, default_value_setter_[0]()); + it = tmp.first; + } + ExtractAttribute extract_attr(attr_name_); + T* attr_value = extract_attr(it->second); + for (const auto& checker : value_checkers_) { + checker(*attr_value); + } } } @@ -380,7 +404,7 @@ class TypedAttrChecker { // check whether op's all attributes fit their own limits class OpAttrChecker { - typedef std::function AttrChecker; + typedef std::function AttrChecker; public: template @@ -390,18 +414,19 @@ class OpAttrChecker { return *(checker.target>()); } - void Check(AttributeMap* attr_map, bool explicit_only = false) const { + void Check(AttributeMap* attr_map, bool explicit_only = false, + bool only_check_exist_value = false) const { auto checker_num = attr_checkers_.size(); if (explicit_only) checker_num = explicit_checker_num_; for (size_t i = 0; i < checker_num; ++i) { - attr_checkers_[i](attr_map, false); + attr_checkers_[i](attr_map, false, only_check_exist_value); } } - AttributeMap GetAttrsDefaultValuesMap() const { + AttributeMap GetDefaultAttrsMap() const { AttributeMap default_values_map; for (const auto& checker : attr_checkers_) { - checker(&default_values_map, true); + checker(&default_values_map, true, false); } return default_values_map; } @@ -410,15 +435,26 @@ class OpAttrChecker { explicit_checker_num_ = attr_checkers_.size(); } + void InitDefaultAttributeMap() { + for (const auto& checker : attr_checkers_) { + checker(&default_attrs_, true, false); + } + } + + const AttributeMap& GetDefaultAttrMap() const { return default_attrs_; } + private: std::vector attr_checkers_; + AttributeMap default_attrs_; + // in order to improve the efficiency of dynamic graph mode, // we divede the attribute into explicit type and implicit type. // for explicit attribute, we mean the attribute added in the customized // op makers, usually it's defined in the overloaded Make method. 
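  // (illustrative example, not part of this patch) an explicit attribute is
  // one registered by a Make() override, e.g.
  //   AddAttr<bool>("use_cudnn", "(bool, default false) use the cuDNN kernel")
  //       .SetDefault(false);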
// for implicit attribute, we mean the attribute added outside of the Make - // method like "op_role", "op_role_var", and they are useless in dynamic graph + // method like "op_role", "op_role_var", and they are useless in dynamic + // graph // mode size_t explicit_checker_num_; }; diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 97d58df6dc5738fd4c0beecd462dbad21480664f..b1c5ff86d19790acb75027d3965bc98e899b7dd8 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -246,7 +246,7 @@ class CustomOperator : public OperatorWithKernel { * it can only be determined at runtime. */ framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { + const framework::ExecutionContext& ctx) const override { return framework::OpKernelType(proto::VarType::RAW, ctx.GetPlace()); } @@ -257,7 +257,7 @@ class CustomOperator : public OperatorWithKernel { */ framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, - const OpKernelType& expected_kernel_type) { + const OpKernelType& expected_kernel_type) const override { return OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, tensor.layout()); } @@ -781,10 +781,12 @@ void RegisterOperatorWithMetaInfo( const imperative::NameVarBaseMap& var_base_map_in, const imperative::NameVarBaseMap& var_base_map_out, const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, const std::map& inplace_map) { CustomGradOpMaker maker( type, var_base_map_in, var_base_map_out, attrs, inplace_map, grad_op_name, grad_op_inputs, grad_op_outputs); + maker.SetDygraphDefaultAttrsMap(default_attrs); return maker(); }; diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc index a65dcbd55f94630612ce59b4d07b0789aaf7c697..733831263a184f5060cca58c26866ac3350c155c 100644 --- a/paddle/fluid/framework/custom_tensor_test.cc +++ b/paddle/fluid/framework/custom_tensor_test.cc @@ -109,9 +109,9 @@ void GroupTestCopy() { TestCopyTensor(); VLOG(2) << "uint8 cpu-cpu-gpu-gpu-cpu"; TestCopyTensor(); - VLOG(2) << "complex64 cpu-cpu-gpu-gpu-cpu"; + VLOG(2) << "complex cpu-cpu-gpu-gpu-cpu"; TestCopyTensor(); - VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu"; + VLOG(2) << "complex cpu-cpu-gpu-gpu-cpu"; TestCopyTensor(); VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu"; TestCopyTensor(); @@ -132,9 +132,9 @@ void GroupTestCast() { TestCast(paddle::DataType::FLOAT32); VLOG(2) << "float cast"; TestCast(paddle::DataType::FLOAT32); - VLOG(2) << "complex64 cast"; + VLOG(2) << "complex cast"; TestCast(paddle::DataType::FLOAT32); - VLOG(2) << "complex128 cast"; + VLOG(2) << "complex cast"; TestCast(paddle::DataType::FLOAT32); VLOG(2) << "float16 cast"; TestCast(paddle::DataType::FLOAT16); diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index 7d005c9690b9486ff8c693d9c14f83853a016ced..f447a00f37c808bafe99b54af4984af9c2af1cfe 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -26,6 +26,13 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, platform::errors::Unavailable("Currently, model parallelism is only " "supported between CPU and CUDA.")); + // NOTE(zhiqiu): Special case for CPU->NPU, avoid stream sync. 
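+  // The early-return path below enqueues the copy on the destination (NPU)
+  // device context, so the Wait() on the source context later in this
+  // function is skipped for CPU -> NPU transfers.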
+ if (platform::is_cpu_place(in.place()) && platform::is_npu_place(dst_place)) { + TensorCopy(in, dst_place, + *platform::DeviceContextPool::Instance().Get(dst_place), out); + return; + } + // NOTE(yy): TransDataDevice should wait for computation of input. if (!platform::is_cuda_pinned_place(in.place())) { platform::DeviceContextPool::Instance().Get(in.place())->Wait(); diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 6f244ee1713597916961ef8dae4d135d9dc88a56..cc4609a740f474efcd1e14ae11a6dca9b79a9c45 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -31,6 +31,11 @@ USE_INT_STAT(STAT_total_feasign_num_in_mem); namespace paddle { namespace framework { +DLManager& global_dlmanager_pool() { + static DLManager manager; + return manager; +} + void RecordCandidateList::ReSize(size_t length) { mutex_.lock(); capacity_ = length; @@ -366,6 +371,10 @@ void InMemoryDataFeed::SetParseInsId(bool parse_ins_id) { template void InMemoryDataFeed::LoadIntoMemory() { #ifdef _LINUX + if (!so_parser_name_.empty()) { + LoadIntoMemoryFromSo(); + return; + } VLOG(3) << "LoadIntoMemory() begin, thread_id=" << thread_id_; std::string filename; while (this->PickOneFile(&filename)) { @@ -408,6 +417,51 @@ void InMemoryDataFeed::LoadIntoMemory() { #endif } +template +void InMemoryDataFeed::LoadIntoMemoryFromSo() { +#ifdef _LINUX + VLOG(3) << "LoadIntoMemoryFromSo() begin, thread_id=" << thread_id_; + + string::LineFileReader reader; + paddle::framework::CustomParser* parser = + global_dlmanager_pool().Load(so_parser_name_, slot_conf_); + + std::string filename; + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + + paddle::framework::ChannelWriter writer(input_channel_); + T instance; + platform::Timer timeline; + timeline.Start(); + + while (1) { + if (!reader.getline(&*(fp_.get()))) { + break; + } else { + const char* str = reader.get(); + ParseOneInstanceFromSo(str, &instance, parser); + } + + writer << std::move(instance); + instance = T(); + } + + writer.Flush(); + timeline.Pause(); + VLOG(3) << "LoadIntoMemoryFromSo() read all lines, file=" << filename + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_; + } + VLOG(3) << "LoadIntoMemoryFromSo() end, thread_id=" << thread_id_; +#endif +} + // explicit instantiation template class InMemoryDataFeed; @@ -638,25 +692,34 @@ bool MultiSlotDataFeed::ParseOneInstanceFromPipe( const char* str = reader.get(); std::string line = std::string(str); - // VLOG(3) << line; + char* endptr = const_cast(str); int pos = 0; for (size_t i = 0; i < use_slots_index_.size(); ++i) { int idx = use_slots_index_[i]; int num = strtol(&str[pos], &endptr, 10); - PADDLE_ENFORCE_NE( - num, 0, - platform::errors::InvalidArgument( - "The number of ids can not be zero, you need padding " - "it in data generator; or if there is something wrong with " - "the data, please check if the data contains unresolvable " - "characters.\nplease check this error line: %s, \n Specifically, " - "something wrong happened(the length of this slot's feasign is 0)" - "when we parse the %d th slots." 
- "Maybe something wrong around this slot" - "\nWe detect the feasign number of this slot is %d, " - "which is illegal.", - str, i, num)); + + if (num <= 0) { + std::stringstream ss; + ss << "\n\nGot unexpected input, maybe something wrong with it.\n"; + ss << "\n----------------------\n"; + ss << "The Origin Input Data:\n"; + ss << "----------------------\n"; + + ss << line << "\n"; + + ss << "\n----------------------\n"; + ss << "Some Possible Errors:\n"; + ss << "----------------------\n"; + ss << "1. The number of ids can not be zero, you need padding.\n"; + ss << "2. The input data contains unresolvable characters.\n"; + ss << "3. We detect the slot " << i << "'s feasign number is " << num + << " which is illegal.\n"; + ss << "\n"; + + PADDLE_THROW(platform::errors::InvalidArgument(ss.str())); + } + if (idx != -1) { (*instance)[idx].Init(all_slots_type_[i]); if ((*instance)[idx].GetType()[0] == 'f') { // float @@ -818,16 +881,23 @@ void MultiSlotInMemoryDataFeed::Init( inductive_shape_index_.resize(all_slot_num); use_slots_.clear(); use_slots_is_dense_.clear(); + slot_conf_.resize(all_slot_num); for (size_t i = 0; i < all_slot_num; ++i) { const auto& slot = multi_slot_desc.slots(i); all_slots_[i] = slot.name(); all_slots_type_[i] = slot.type(); use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1; + + slot_conf_[i].name = slot.name(); + slot_conf_[i].type = slot.type(); + slot_conf_[i].use_slots_index = use_slots_index_[i]; + total_dims_without_inductive_[i] = 1; inductive_shape_index_[i] = -1; if (slot.is_used()) { use_slots_.push_back(all_slots_[i]); use_slots_is_dense_.push_back(slot.is_dense()); + slot_conf_[i].use_slots_is_dense = slot.is_dense(); std::vector local_shape; if (slot.is_dense()) { for (int j = 0; j < slot.shape_size(); ++j) { @@ -860,6 +930,7 @@ void MultiSlotInMemoryDataFeed::Init( } visit_.resize(all_slot_num, false); pipe_command_ = data_feed_desc.pipe_command(); + so_parser_name_ = data_feed_desc.so_parser_name(); finish_init_ = true; input_type_ = data_feed_desc.input_type(); } @@ -878,6 +949,12 @@ void MultiSlotInMemoryDataFeed::GetMsgFromLogKey(const std::string& log_key, *rank = (uint32_t)strtoul(rank_str.c_str(), NULL, 16); } +void MultiSlotInMemoryDataFeed::ParseOneInstanceFromSo(const char* str, + Record* instance, + CustomParser* parser) { + parser->ParseOneInstance(str, instance); +} + bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) { #ifdef _LINUX thread_local string::LineFileReader reader; diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index ec79005dfecc1421c4562f0f1cd362dee7550700..04a5b9b4d3adaf16d74bd641a4d60e492eb882fa 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -117,6 +117,94 @@ using PvInstance = PvInstanceObject*; inline PvInstance make_pv_instance() { return new PvInstanceObject(); } +struct SlotConf { + std::string name; + std::string type; + int use_slots_index; + int use_slots_is_dense; +}; + +class CustomParser { + public: + CustomParser() {} + virtual ~CustomParser() {} + virtual void Init(const std::vector& slots) = 0; + virtual void ParseOneInstance(const char* str, Record* instance) = 0; +}; + +typedef paddle::framework::CustomParser* (*CreateParserObjectFunc)(); + +class DLManager { + struct DLHandle { + void* module; + paddle::framework::CustomParser* parser; + }; + + public: + DLManager() {} + + ~DLManager() { +#ifdef _LINUX + std::lock_guard lock(mutex_); + for (auto it = handle_map_.begin(); it != 
handle_map_.end(); ++it) { + delete it->second.parser; + dlclose(it->second.module); + } +#endif + } + + bool Close(const std::string& name) { +#ifdef _LINUX + auto it = handle_map_.find(name); + if (it == handle_map_.end()) { + return true; + } + delete it->second.parser; + dlclose(it->second.module); +#endif + VLOG(0) << "Not implement in windows"; + return false; + } + + paddle::framework::CustomParser* Load(const std::string& name, + std::vector& conf) { +#ifdef _LINUX + std::lock_guard lock(mutex_); + DLHandle handle; + std::map::iterator it = handle_map_.find(name); + if (it != handle_map_.end()) { + return it->second.parser; + } + + handle.module = dlopen(name.c_str(), RTLD_NOW); + if (handle.module == nullptr) { + VLOG(0) << "Create so of " << name << " fail"; + return nullptr; + } + + CreateParserObjectFunc create_parser_func = + (CreateParserObjectFunc)dlsym(handle.module, "CreateParserObject"); + handle.parser = create_parser_func(); + handle.parser->Init(conf); + handle_map_.insert({name, handle}); + + return handle.parser; +#endif + VLOG(0) << "Not implement in windows"; + return nullptr; + } + + paddle::framework::CustomParser* ReLoad(const std::string& name, + std::vector& conf) { + Close(name); + return Load(name, conf); + } + + private: + std::mutex mutex_; + std::map handle_map_; +}; + class DataFeed { public: DataFeed() { @@ -252,6 +340,8 @@ class DataFeed { bool finish_set_filelist_; bool finish_start_; std::string pipe_command_; + std::string so_parser_name_; + std::vector slot_conf_; std::vector ins_id_vec_; std::vector ins_content_vec_; platform::Place place_; @@ -324,10 +414,13 @@ class InMemoryDataFeed : public DataFeed { virtual void SetEnablePvMerge(bool enable_pv_merge); virtual void SetCurrentPhase(int current_phase); virtual void LoadIntoMemory(); + virtual void LoadIntoMemoryFromSo(); protected: virtual bool ParseOneInstance(T* instance) = 0; virtual bool ParseOneInstanceFromPipe(T* instance) = 0; + virtual void ParseOneInstanceFromSo(const char* str, T* instance, + CustomParser* parser) {} virtual void PutToFeedVec(const std::vector& ins_vec) = 0; int thread_id_; @@ -688,6 +781,8 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { protected: virtual bool ParseOneInstance(Record* instance); virtual bool ParseOneInstanceFromPipe(Record* instance); + virtual void ParseOneInstanceFromSo(const char* str, Record* instance, + CustomParser* parser); virtual void PutToFeedVec(const std::vector& ins_vec); virtual void GetMsgFromLogKey(const std::string& log_key, uint64_t* search_id, uint32_t* cmatch, uint32_t* rank); diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto index 8bbbd06e7ef6a42c9671a8c03e7c938cafefffc3..c1149ed7518e7a39dca12c9605f4ac9d6a97d511 100644 --- a/paddle/fluid/framework/data_feed.proto +++ b/paddle/fluid/framework/data_feed.proto @@ -33,4 +33,5 @@ message DataFeedDesc { optional string rank_offset = 6; optional int32 pv_batch_size = 7 [ default = 32 ]; optional int32 input_type = 8 [ default = 0 ]; + optional string so_parser_name = 9; } diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 8ff94b0277c0cb894ec5c324e0bee962004bb6ee..8708d90485af8fffab7a5c04d3c132e1ced82364 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -143,7 +143,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, 
DataLayout out_layout, const Tensor& in, Tensor* out, - platform::Place place) { + platform::Place place, bool always_copy) { PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Input tensor format is invalid. Input tensor should " @@ -177,7 +177,7 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, // output tensor has the same dims as input. Reorder don't change dims out->Resize(in.dims()); - if (in_format != out_format) { + if ((in_format != out_format) || always_copy) { void* in_data = GetDataFromTensor(in, in_type); std::string key = platform::CreateKey(*dev_ctx, in_tz, in_format, out_format, in_type); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 238f2d2e67914c7ae1443d09cf915439ebad4dd5..3404ba2db67e5f0e90203d7ee0bb238bb377af0f 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -78,7 +78,8 @@ inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, const Tensor& in, Tensor* out, - platform::Place place); + platform::Place place, + bool always_copy = false); void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index c8f73a5469ab32a5734d980010a52a6f72eb6ca8..a16f35dc11b8f1525685fe3499cfdce6f9b86968 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -18,8 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/eigen_ext.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" @@ -27,9 +26,11 @@ limitations under the License. 
*/ namespace paddle { namespace platform { struct bfloat16; -struct complex128; -struct complex64; +template +struct complex; struct float16; +template +struct complex; } // namespace platform } // namespace paddle @@ -50,27 +51,31 @@ struct DataTypeTrait { #define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); -#define _ForEachDataType_(callback) \ - _ForEachDataTypeHelper_(callback, float, FP32); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \ - _ForEachDataTypeHelper_(callback, double, FP64); \ - _ForEachDataTypeHelper_(callback, int, INT32); \ - _ForEachDataTypeHelper_(callback, int64_t, INT64); \ - _ForEachDataTypeHelper_(callback, bool, BOOL); \ - _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ - _ForEachDataTypeHelper_(callback, int16_t, INT16); \ - _ForEachDataTypeHelper_(callback, int8_t, INT8); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128); - -#define _ForEachDataTypeSmall_(callback) \ - _ForEachDataTypeHelper_(callback, float, FP32); \ - _ForEachDataTypeHelper_(callback, double, FP64); \ - _ForEachDataTypeHelper_(callback, int, INT32); \ - _ForEachDataTypeHelper_(callback, int64_t, INT64); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::complex64, COMPLEX64); \ - _ForEachDataTypeHelper_(callback, ::paddle::platform::complex128, COMPLEX128); +#define _ForEachDataType_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::bfloat16, BF16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX128); + +#define _ForEachDataTypeSmall_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX64); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::complex, \ + COMPLEX128); // For the use of thrust, as index-type elements can be only integers. 
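// (usage sketch with a hypothetical callback macro) each _ForEachDataType*_
// macro above expands `callback(cpp_type, proto_type);` once per listed type,
// so type-indexed code can be generated in one place, e.g. inside a function:
//   #define LogTypePair(cpp_type, proto_type) \
//     VLOG(3) << "size " << sizeof(cpp_type) << " -> proto enum " << proto_type
//   _ForEachDataTypeSmall_(LogTypePair);
//   #undef LogTypePair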
#define _ForEachDataTypeTiny_(callback) \ diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 5a716eba8dbe86e37c1ca1758751f04bdd6c651d..888687c06ce9073108ea5439037da966c45cceda 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -119,12 +119,12 @@ void TransComplexToReal(const proto::VarType::Type& dst_type, // complex -> real switch (src_type) { case proto::VarType::COMPLEX64: - framework::VisitDataType(dst_type, - CastDataType(in, out, ctx)); + framework::VisitDataType( + dst_type, CastDataType>(in, out, ctx)); break; case proto::VarType::COMPLEX128: framework::VisitDataType( - dst_type, CastDataType(in, out, ctx)); + dst_type, CastDataType>(in, out, ctx)); break; default: PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/framework/details/nan_inf_utils.h b/paddle/fluid/framework/details/nan_inf_utils.h index 4d7d9afe7019290e44bb6d20ce42784b8631cadd..cf64ccd60f45a40b6c9ca83dcdd473686d03904f 100644 --- a/paddle/fluid/framework/details/nan_inf_utils.h +++ b/paddle/fluid/framework/details/nan_inf_utils.h @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -30,9 +31,28 @@ void CheckVarHasNanOrInf(const std::string& op_type, const std::string& var_name, const platform::Place& place); +void CheckVarHasNanOrInf(const std::string& op_type, + const std::string& var_name, + const framework::Variable* var, + const platform::Place& place); + void CheckOpHasNanOrInf(const framework::OperatorBase& op, const framework::Scope& scope, const platform::Place& place); + +template +void CheckOpHasNanOrInfInDygraph(const std::string& op_type, + const imperative::NameVarMap& op_outs, + platform::Place place) { + for (const auto& pair : op_outs) { + for (const auto& ivar : pair.second) { + auto* var = ivar->MutableVar(); + if (var == nullptr) continue; + CheckVarHasNanOrInf(op_type, ivar->Name(), var, place); + } + } +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 0fdb97db20af992998d94e37263f415a84cd1ba1..30231a1799fd3714646a81bba2afb5de03045850 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -159,10 +159,11 @@ static void PrintNanInf(const T* value, const size_t numel, int print_num, #pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in) #pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \ omp_in) -#pragma omp declare reduction(+ : paddle::platform::complex64 : omp_out += \ - omp_in) -#pragma omp declare reduction(+ : paddle::platform::complex128 : omp_out += \ - omp_in) +#pragma omp declare reduction(+ : paddle::platform::complex < \ + float > : omp_out += omp_in) +#pragma omp declare reduction(+ : paddle::platform::complex < \ + double > : omp_out += omp_in) + #endif template @@ -218,9 +219,9 @@ void CheckNanInf( } template <> -void CheckNanInf( - const paddle::platform::complex64* value, const size_t numel, int print_num, - const std::string& op_type, const std::string& var_name) { +void CheckNanInf>( + const paddle::platform::complex* value, const size_t numel, + int print_num, const std::string& op_type, const 
std::string& var_name) { float real_sum = 0.0f; #pragma omp parallel for reduction(+ : real_sum) for (size_t i = 0; i < numel; ++i) { @@ -244,9 +245,9 @@ void CheckNanInf( } template <> -void CheckNanInf( - const paddle::platform::complex128* value, const size_t numel, - int print_num, const std::string& op_type, const std::string& var_name) { + void CheckNanInf>> + (const paddle::platform::complex* value, const size_t numel, + int print_num, const std::string& op_type, const std::string& var_name) { double real_sum = 0.0; #pragma omp parallel for reduction(+ : real_sum) for (size_t i = 0; i < numel; ++i) { @@ -268,12 +269,17 @@ void CheckNanInf( op_type)); } } + #endif template <> template void TensorCheckerVisitor::apply( - typename std::enable_if::value>::type*) const { + typename std::enable_if< + std::is_floating_point::value || + std::is_same>::value || + std::is_same>::value>::type*) + const { // use env strategy control in future, -1=print_all. int print_num = 3; CheckNanInf(tensor_.data(), tensor_.numel(), print_num, op_type_, @@ -291,13 +297,12 @@ void tensor_check(const std::string& op_type, } void CheckVarHasNanOrInf(const std::string& op_type, - const framework::Scope& scope, const std::string& var_name, + const framework::Variable* var, const platform::Place& place) { - auto* var = scope.FindVar(var_name); PADDLE_ENFORCE_NOT_NULL( - var, platform::errors::NotFound("In op=%s, can't find var:%s", op_type, - var_name)); + var, platform::errors::NotFound("Cannot find var: `%s` in op `%s`.", + var_name, op_type)); const Tensor* tensor{nullptr}; if (var->IsType()) { @@ -387,6 +392,14 @@ void CheckVarHasNanOrInf(const std::string& op_type, tensor_check(op_type, var_name, *tensor, place); } +void CheckVarHasNanOrInf(const std::string& op_type, + const framework::Scope& scope, + const std::string& var_name, + const platform::Place& place) { + auto* var = scope.FindVar(var_name); + CheckVarHasNanOrInf(op_type, var_name, var, place); +} + bool IsSkipOp(const framework::OperatorBase& op) { if (op_type_nan_inf_white_list().count(op.Type()) != 0) return true; diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index 96d1a9fb94927debf8525fdc8b9597f08eeb7129..a9ea336e42545720df3f7226dac51531b26ebfff 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -123,7 +123,11 @@ __global__ void CheckNanInfKernel(const T* value, const size_t numel, template <> template void TensorCheckerVisitor::apply( - typename std::enable_if::value>::type*) const { + typename std::enable_if< + std::is_floating_point::value || + std::is_same>::value || + std::is_same>::value>::type*) + const { int print_num = 3; auto* dev_ctx = reinterpret_cast( diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index b4459e5a7c1cc6ad6faa9e19f39bff47fe128344..10b7ab0bc9c534faee7be0a20182ad96c4550844 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -46,8 +46,12 @@ struct TensorCheckerVisitor { } template - void apply(typename std::enable_if::value>::type* = - 0) const; + void apply( + typename std::enable_if< + std::is_floating_point::value || + std::is_same>::value || + std::is_same>::value>::type* = + 0) const; std::string op_type_; std::string var_name_; diff --git a/paddle/fluid/framework/details/op_registry.h 
b/paddle/fluid/framework/details/op_registry.h index df5370e42ee9f3ab9620e95d230f603fcda8e94b..27f55e237f51689bc5dfcc1d5bcc92496aa506cb 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -249,8 +249,10 @@ struct OpInfoFiller { const imperative::NameVarBaseMap& var_base_map_in, const imperative::NameVarBaseMap& var_base_map_out, const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, const std::map& inplace_map) { T maker(type, var_base_map_in, var_base_map_out, attrs, inplace_map); + maker.SetDygraphDefaultAttrsMap(default_attrs); return maker(); }; } diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index a49e492e48028b15d724cbdc7c1b5efbc809ddcf..c44bda490bb6f05ae77001de4748bb2b73a88df8 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -29,7 +29,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/executor_gc_helper.h" -#include "paddle/fluid/framework/heter_service.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" @@ -195,6 +195,9 @@ class DeviceWorker { virtual void SetReaderPlace(const paddle::platform::Place& place) { device_reader_->SetPlace(place); } + virtual void SetDeviceContext(platform::DeviceContext* dev_ctx) { + dev_ctx_ = dev_ctx; + } virtual Scope* GetThreadScope() { return thread_scope_; } DataFeed* device_reader_ = nullptr; @@ -221,6 +224,7 @@ class DeviceWorker { int dump_mode_ = 0; int dump_interval_ = 10000; ChannelWriter writer_; + platform::DeviceContext* dev_ctx_ = nullptr; }; class CPUWorkerBase : public DeviceWorker { @@ -440,107 +444,6 @@ class HeterCpuWorker : public HogwildWorker { }; #endif -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \ - defined PADDLE_WITH_XPU) && \ - (defined PADDLE_WITH_PSLIB) -class HeterBoxWorker : public HogwildWorker { - public: - HeterBoxWorker() {} - virtual ~HeterBoxWorker() {} - virtual void Initialize(const TrainerDesc& desc); - virtual void TrainFiles(); - virtual void SetNeedDump(bool need_dump_field); - virtual void SetChannelWriter(ChannelObject* queue); - virtual void SetWorkerNum(int num) { worker_num_ = num; } - virtual void CacheProgram(const ProgramDesc& main_program) { - new (&program_) ProgramDesc(main_program); - } - void ProduceTasks() override; - virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } - virtual void SetEvent(const gpuEvent_t event) { event_ = event; } - virtual void TrainFilesWithProfiler() {} - void ResetStat(); - - protected: - std::shared_ptr fleet_ptr_; - void FillSparseValue(std::shared_ptr task, size_t table_id); - void PushGradients(); - void CollectLabelInfo(std::shared_ptr task, size_t table_id); - void AdjustInsWeight(std::shared_ptr task); - void DumpParam(); - void CopySparseTable(); - void CopyDenseTable(); - void CopyDenseVars(); - - private: - int mpi_rank_; - std::mutex mutex_; - std::vector send_var_list_; - int worker_num_; - ProgramDesc program_; - HeterObjectPool object_pool_; - bool need_dump_param_; - std::vector dump_param_; - bool need_to_push_dense_; - bool need_dump_field_; - bool dump_slot_; - bool need_to_push_sparse_; - std::vector dump_fields_; - ChannelWriter writer_; - DownpourWorkerParameter param_; - float scale_datanorm_; - // just save the 
value in param_ for easy access - std::map label_var_name_; - std::map> sparse_key_names_; - std::map> sparse_value_names_; - std::map> sparse_grad_names_; - std::map> dense_value_names_; - std::map> dense_grad_names_; - platform::Place root_place_; - // actually pushed feasign of each table - std::map> sparse_push_keys_; - - // skipped ops - std::vector skip_ops_; - - std::vector<::std::future> push_sparse_status_; - std::vector<::std::future> push_dense_status_; - - // adjust ins weight - AdjustInsWeightConfig adjust_ins_weight_config_; - std::vector nid_show_; - // check nan and inf during training - std::vector check_nan_var_names_; - // copy table - CopyTableConfig copy_table_config_; - std::map table_dependency_; - std::vector> copy_sparse_tables_; - std::vector> copy_dense_tables_; - std::unordered_map> feasign_set_; - paddle::framework::Channel> pull_queue_; - paddle::framework::Channel> push_queue_; - gpuEvent_t event_; - gpuStream_t copy_stream_; - int batch_cnt_{0}; - std::atomic done_cnt_{0}; - - double total_time_; - double read_time_; - double pack_time_; - double pull_sparse_local_time_; - double op_all_time_; - double xpu_op_time_; - double xpu_wait_time_; - double cpu_op_time_; - double collect_label_time_; - double fill_sparse_time_; - double push_sparse_time_; - double gpu_2_cpu_time_; - double cpu_2_gpu_time_; - uint64_t total_inst_; -}; -#endif - #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) class PSGPUWorker : public HogwildWorker { @@ -619,7 +522,6 @@ class PSGPUWorker : public HogwildWorker { gpuStream_t copy_stream_; int batch_cnt_{0}; std::atomic done_cnt_{0}; - platform::DeviceContext* dev_ctx_ = nullptr; double total_time_; double read_time_; @@ -639,7 +541,7 @@ class PSGPUWorker : public HogwildWorker { #endif #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) class SectionWorker : public DeviceWorker { public: SectionWorker() {} @@ -679,6 +581,7 @@ class SectionWorker : public DeviceWorker { void RunUpdate( std::unique_ptr&, std::unordered_map>&); + void PrepareUnusedVar(); protected: int section_id_; @@ -693,6 +596,8 @@ class SectionWorker : public DeviceWorker { std::vector> ops_; std::shared_ptr program_; + std::unordered_map> + unused_vars_; static uint64_t batch_id_; platform::DeviceContext* dev_ctx_ = nullptr; diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index 5780a95343385e984dd4f1d15123b715c1822a9e..b6f87811bbdb813fadd5ac8a20bd7bf55415d01f 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -69,18 +69,13 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt); REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker); #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ - (defined PADDLE_WITH_PSLIB) -REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker); -#endif - #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker); #endif #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) REGISTER_DEVICE_WORKER_CLASS(SectionWorker); #endif } // namespace framework diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 654b88920acaf68f1ea5b7b1513735f25255b118..a0a2317b44d94b8e74c7f6c1174acef55fe5e00a 100644 --- 
a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -1,4 +1,5 @@ // Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 NVIDIA Corporation. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -47,6 +48,7 @@ message HybridConfig { optional int32 dp_degree = 1 [ default = -1 ]; optional int32 mp_degree = 2 [ default = 1 ]; optional int32 pp_degree = 3 [ default = 1 ]; + optional int32 sharding_degree = 4 [ default = 1 ]; } message AMPConfig { @@ -118,6 +120,16 @@ message ExecutionStrategy { optional bool use_thread_barrier = 4 [ default = false ]; } +message GradientScaleConfig { + // Optional value ['avg', 'sum', 'customized'] + // If avg, loss@grad will be divided by the number of devices, + // that is, the gradient will be accumulated and averaged among + // multiple devices. + // Else if sum, the gradient will accumulated among multiple + // devices. + optional string scale_strategy = 1 [ default = 'avg' ]; +} + message AsyncConfig { optional int32 k_steps = 1 [ default = -1 ]; optional int32 max_merge_var_num = 2 [ default = 1 ]; @@ -141,6 +153,7 @@ message PipelineConfig { message TensorParallelConfig { optional int32 tensor_parallel_degree = 1 [ default = 1 ]; + optional int32 tensor_init_seed = 2 [ default = -1 ]; } message DistributedStrategy { @@ -172,8 +185,12 @@ message DistributedStrategy { optional bool fp16_allreduce = 25 [ default = false ]; optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 27 [ default = 1 ]; - optional bool find_unused_parameters = 28 [ default = true ]; + optional bool find_unused_parameters = 28 [ default = false ]; optional bool tensor_parallel = 29 [ default = false ]; + optional bool without_graph_optimization = 30 [ default = false ]; + optional int32 fuse_grad_size_in_num = 31 [ default = 1 ]; + optional bool calc_comm_same_stream = 32 [ default = false ]; + optional bool asp = 33 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; @@ -190,6 +207,7 @@ message DistributedStrategy { optional TensorParallelConfig tensor_parallel_configs = 113; optional BuildStrategy build_strategy = 201; optional ExecutionStrategy execution_strategy = 202; + optional GradientScaleConfig gradient_scale_configs = 203; } message DistributedJobInfo { diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index b99ab6b5a7ff195ef7d659598df88467bb158c6e..f1f5ba7789ea6137800e7fcfe2d404ca2d87845b 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -28,9 +28,17 @@ namespace internal { template static ::DLDataType GetDLDataTypeCode() { ::DLDataType dtype; - if (std::is_same::value || - std::is_same::value || - std::is_floating_point::value) { + if (std::is_same>::value || + std::is_same>::value) { + // The current dlpack library version is v0.2, and does not define + // kDLComplex value. But kDLComplex is defined by 5U in v0.4, so we set + // dtype.code to 5U directly here. After the dlpack library version being + // upgraded to v0.4, it should be written as follow. 
+ // dtype.code = kDLComplex; + dtype.code = 5U; + } else if (std::is_same::value || + std::is_same::value || + std::is_floating_point::value) { dtype.code = kDLFloat; } else if (std::is_unsigned::value) { dtype.code = kDLUInt; @@ -87,6 +95,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { platform::errors::Unimplemented("platform::NPUPlace is not supported")); } + inline ::DLContext operator()(const platform::NPUPinnedPlace &place) const { + PADDLE_THROW(platform::errors::Unimplemented( + "platform::NPUPinnedPlace is not supported")); + } + inline ::DLContext operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLContext ctx; diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index d03437034d62ad0e4249a96d71f5f7544647e704..8265d105accae0b8a009b1798a6c36053b51ab25 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -28,6 +28,11 @@ namespace framework { namespace { // NOLINT template constexpr uint8_t GetDLDataTypeCode() { + if (std::is_same>::value || + std::is_same>::value) { + return static_cast(5); + } + return std::is_same::value || std::is_floating_point::value ? static_cast(kDLFloat) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index e5bfbf4a8f779a4a1baf9f23c894eadd1d1c4902..de007c128d7543c1433426e80abcbd80ee47dee8 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -72,7 +72,7 @@ Executor::~Executor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_); + ClearMKLDNNCache(place_, this); #endif } @@ -169,6 +169,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool force_disable_gc, bool keep_kid_scopes) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars, keep_kid_scopes); @@ -294,6 +297,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, const std::string& fetch_holder_name) { platform::RecordBlock b(kProgramId); if (FLAGS_use_mkldnn) EnableMKLDNN(program); +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif bool has_feed_ops = has_feed_operators(program.Block(0), *feed_targets, feed_holder_name); bool has_fetch_ops = @@ -576,7 +582,6 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) { } } } - platform::AttachPointerHashToMKLDNNKey(this, place_); #else LOG(WARNING) << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 7593b60abfffcd9a0a3e9f743930660327c1409e..9c9f29520de439ee209ced19f448bde9905b231b 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -20,14 +20,12 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/trainer.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/framework/executor_cache.h b/paddle/fluid/framework/executor_cache.h index 782018d1cfe109c3a0cb4919969665207dcfbc9e..3beeacb1010d2687ac0dfd58092773f52c4fafdc 100644 --- a/paddle/fluid/framework/executor_cache.h +++ b/paddle/fluid/framework/executor_cache.h @@ -22,8 +22,10 @@ #include #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index c06a3d4a183799c7c8ca130f9ff48e7bff23a3bd..4b7c8c6e3f49bca036a0bf1f367071b273381f01 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -20,8 +20,12 @@ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" +#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -185,5 +189,91 @@ void DeleteUnusedTensors( } } +static std::vector> CreateOpsFromBlock( + const BlockDesc &block) { + std::vector> ops; + size_t op_num = block.OpSize(); + ops.reserve(op_num); + for (size_t i = 0; i < op_num; ++i) { + auto *op_desc = block.Op(i); + ops.push_back(OpRegistry::CreateOp(*op_desc)); + } + return ops; +} + +std::vector>> GetEagerDeletionCleanVars( + const ProgramDesc &origin_program, + const std::vector &skip_vars) { + ProgramDesc program{origin_program}; + size_t block_num = program.Size(); + PADDLE_ENFORCE_GE(block_num, 1, + platform::errors::PermissionDenied( + "Program should have at least one block")); + + // prepare safe GCs on sub block ops + auto global_block_ops = CreateOpsFromBlock(program.Block(0)); + operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( + program, 0, global_block_ops); + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(program, 0, + global_block_ops); + operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( + program, 0, global_block_ops); + + // find the skip vars on each block + std::vector> skip_vars_on_each_block(block_num); + skip_vars_on_each_block[0] = skip_vars; + std::vector found_skip_vars(block_num, false); + found_skip_vars[0] = true; + + const char *kSubBlock = "sub_block"; + const char *kSkipEagerDeletionVars = "skip_eager_deletion_vars"; + + for (size_t i = 0; i < block_num; ++i) { + const auto &block = program.Block(i); + size_t op_num = block.OpSize(); + for (size_t j = 0; j < op_num; ++j) { + auto *op = block.Op(j); + if (!op->HasAttr(kSubBlock) || 
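+        // Only the control-flow ops prepared above (conditional_block, while,
+        // recurrent and their grad ops) are expected to carry both the
+        // sub_block attribute and a skip_eager_deletion_vars list; every
+        // other op fails this check and is skipped.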
!op->HasAttr(kSkipEagerDeletionVars)) { + continue; + } + auto sub_block_id = op->GetAttrIfExists(kSubBlock)->ID(); + PADDLE_ENFORCE_GE(sub_block_id, 0, + platform::errors::PermissionDenied( + "sub_block id must be non-negative number")); + PADDLE_ENFORCE_LT(sub_block_id, block_num, + platform::errors::PermissionDenied( + "sub_block id exceeds max block num")); + PADDLE_ENFORCE_EQ( + found_skip_vars[sub_block_id], false, + platform::errors::PermissionDenied( + "there are 2 ops which refer to the same sub_block %d", + sub_block_id)); + + found_skip_vars[sub_block_id] = true; + auto sub_block_skip_vars = + op->GetAttrIfExists>(kSkipEagerDeletionVars); + skip_vars_on_each_block[sub_block_id] = std::move(sub_block_skip_vars); + } + } + + std::vector>> result; + result.reserve(block_num); + for (size_t i = 0; i < block_num; ++i) { + const auto &block = program.Block(i); + const auto block_ops = CreateOpsFromBlock(block); + const auto &block_skip_vars = skip_vars_on_each_block[i]; + auto delete_var_map = GetUnusedVars(block, block_ops, block_skip_vars); + std::vector> block_result; + block_result.reserve(block_ops.size()); + for (const auto &op : block_ops) { + auto &delete_vars = delete_var_map[op.get()]; + std::sort(delete_vars.begin(), delete_vars.end()); // for stable result + block_result.emplace_back(delete_vars); + } + result.emplace_back(std::move(block_result)); + } + return result; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor_gc_helper.h b/paddle/fluid/framework/executor_gc_helper.h index e44edc5aa1c810f859942a62763e0c9179885987..886341791bade8697773bac69722f6827d5e33d8 100644 --- a/paddle/fluid/framework/executor_gc_helper.h +++ b/paddle/fluid/framework/executor_gc_helper.h @@ -43,5 +43,11 @@ void DeleteUnusedTensors( &delete_vars_map, GarbageCollector *gc); +// Get the clean vars of GC after each op runs. This function is used for +// analysis statically. 
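+// A hypothetical usage sketch (LoadProgram and the "fetch" skip var are
+// illustrative only, not part of this interface):
+//   ProgramDesc program = LoadProgram(model_path);
+//   auto clean_vars = GetEagerDeletionCleanVars(program, {"fetch"});
+//   // clean_vars[b][i] holds the variables that may be garbage-collected
+//   // right after op i of block b finishes.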
+// result is in the format: result[block_idx][op_idx][delete_var_idx] +std::vector>> GetEagerDeletionCleanVars( + const ProgramDesc &program, const std::vector &skip_vars = {}); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 03dd2cff655c063a27f2c4efccd41e3f9e9547de..a9e4691dd0a01544e1d75d3d27dce43585081837 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -1,5 +1,10 @@ if(WITH_PSLIB) - cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope pslib_brpc pslib) + if(WITH_PSLIB_BRPC) + set(BRPC_DEPS pslib_brpc) + else() + set(BRPC_DEPS brpc) + endif(WITH_PSLIB_BRPC) + cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope ${BRPC_DEPS} pslib) else() cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) endif(WITH_PSLIB) @@ -7,11 +12,11 @@ endif(WITH_PSLIB) if(WITH_HETERPS) if(WITH_NCCL) nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps) + DEPS heter_ps ${BRPC_DEPS}) add_subdirectory(heter_ps) elseif(WITH_RCCL) hip_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps) + DEPS heter_ps ${BRPC_DEPS}) add_subdirectory(heter_ps) endif(WITH_NCCL) else() @@ -39,7 +44,17 @@ else() cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope) endif(WITH_GLOO) -cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto) +if(WITH_PSLIB) +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") +endif() +set_source_files_properties(heter_wrapper.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +endif() + +cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto +device_context heter_service_proto ${BRPC_DEPS}) cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 3cd8b55026e5189f46423163985cf18e4e4fcdad..dfe94cf1eb39ae464916c1626d1541741aaeed31 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -551,16 +551,36 @@ void FleetWrapper::PullSparseVarsSync( for (auto& t : *fea_values) { pull_result_ptr.push_back(t.data()); } - auto status = pslib_ptr_->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); - pull_sparse_status.push_back(std::move(status)); - for (auto& t : pull_sparse_status) { - t.wait(); - auto status = t.get(); - if (status != 0) { - LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; - sleep(sleep_seconds_before_fail_exit_); - exit(-1); + + int32_t cnt = 0; + while (true) { + pull_sparse_status.clear(); + auto status = pslib_ptr_->_worker_ptr->pull_sparse( + pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); + pull_sparse_status.push_back(std::move(status)); + bool flag = true; + for (auto& t : pull_sparse_status) { + t.wait(); + int32_t status = -1; + try { + status = t.get(); + } catch (const std::future_error& e) { + VLOG(0) << "Caught a future_error with code" << e.code() + << ", Message:" << e.what(); 
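+        // A future_error leaves `status` at -1, so the failure branch below
+        // sleeps, clears `flag` and bumps `cnt`; once `cnt` exceeds 3 the
+        // worker gives up and exits, otherwise the whole pull_sparse request
+        // is re-issued because the surrounding while(true) loops again.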
+ } + if (status != 0) { + VLOG(0) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); + flag = false; + cnt++; + } + if (cnt > 3) { + VLOG(0) << "fleet pull sparse failed, retry 3 times"; + exit(-1); + } + } + if (flag) { + break; } } #endif diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 613b2803637d2d8e388697b6959110da6583a7cc..09f7801b19f988bb7c0948b127b79e6d848629be 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -28,7 +28,7 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/heter_service.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 6df2cd52bb401d3cc378c2776073471070f1e411..939b5e3099a62a8194cf7202e3fe6fe697ff9210 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -1,10 +1,18 @@ IF(WITH_GPU) - nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) - nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm) + SET(HETERPS_DEPS device_context) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + SET(HETERPS_DEPS ${HETERPS_DEPS} cub) + endif() + if(WITH_PSCORE) + get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) + endif() + nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS ${HETERPS_DEPS}) + nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) - hip_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm) + hip_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) hip_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) ENDIF() diff --git a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h index c5647f2cdcffce4a8b53f485b59717eb739266fb..8b04d703c8898b7949c22e45fa9a3f58e9e44e03 100644 --- a/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h @@ -765,7 +765,7 @@ x.second ); unsigned long long get_num_collisions() const { return m_collisions; } void print() { - for (size_type i = 0; i < 10; ++i) { + for (size_type i = 0; i < 5; ++i) { std::cout << i << ": " << m_hashtbl_values[i].first << "," << m_hashtbl_values[i].second << std::endl; } diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.h b/paddle/fluid/framework/fleet/heter_ps/feature_value.h index c3bf33b32c2daf298ddc9af546c4c047bf6e9a6e..f6c4d47ce2d18b6fb89380ce31f06e70e15df768 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.h +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.h @@ -52,25 +52,6 @@ struct FeaturePushValue { float lr_g; float mf_g[MF_DIM]; }; -// class DownpourFixedFeatureValue { -// public: -// 
DownpourFixedFeatureValue() {} -// ~DownpourFixedFeatureValue() {} -// float* data() { -// return _data.data(); -// } -// size_t size() { -// return _data.size(); -// } -// void resize(size_t size) { -// _data.resize(size); -// } -// void shrink_to_fit() { -// _data.shrink_to_fit(); -// } -// private: -// std::vector _data; -// }; } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index 089130f6da8c734d3e12b06e734089f8a523a24d..3782e14ad41a5ed6ce5ef1eb0788842d03ecddc7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -17,16 +17,16 @@ limitations under the License. */ #include #include #include -#ifdef PADDLE_WTIH_PSLIB +#ifdef PADDLE_WITH_PSLIB #include "common_value.h" // NOLINT #endif #ifdef PADDLE_WITH_PSCORE +#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" #endif #include "thrust/pair.h" //#include "cudf/concurrent_unordered_map.cuh.h" #include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h" #ifdef PADDLE_WITH_HETERPS -#include "paddle/fluid/distributed/table/depends/large_scale_kv.h" #include "paddle/fluid/platform/type_defs.h" namespace paddle { diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 1b4205e3c38fe27419c4ba42e6950b581db62a99..a2e09b7e08132f990628b631aa0730a6a162add7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -115,7 +115,7 @@ void HeterComm::init_path() { path_.resize(total_gpu); if (!topo_aware_) { - VLOG(1) << "init path without topo aware"; + VLOG(3) << "init path without topo aware"; for (int i = 0; i < total_gpu; ++i) { path_[i].resize(total_gpu); for (int j = 0; j < total_gpu; ++j) { @@ -130,7 +130,7 @@ void HeterComm::init_path() { } } } else { - VLOG(1) << "init path with topo aware"; + VLOG(3) << "init path with topo aware"; for (int i = 0; i < total_gpu; ++i) { path_[i].resize(total_gpu); for (int j = 0; j < total_gpu; ++j) { diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index 7e82a8e014fd3cb33b706c9fc5c1e671392e05a7..362877aa1604e001acca26dab2cc7c0f1379e12b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -23,30 +23,6 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -__device__ double cuda_double_random(unsigned long long seed) { - // copy from MurmurHash3 - seed ^= seed >> 33; - seed *= 0xff51afd7ed558ccd; - seed ^= seed >> 33; - seed *= 0xc4ceb9fe1a85ec53; - seed ^= seed >> 33; - return ((double)seed / 18446744073709551615.0); -} - -__device__ float cuda_normal_random(unsigned long long idx) { - static double pi = 3.1415926897932384; - unsigned long long x = clock64() + idx; - double x1, x2, res; - while (1) { - x1 = cuda_double_random(x); - x2 = cuda_double_random(x + 33); - res = sqrt(-2.0 * log(x1)) * cos(2.0 * pi * x2); - if (-10 < res && res < 10) break; - x += 207; - } - return res; -} - template class Optimizer { public: @@ -95,11 +71,12 @@ class Optimizer { } __device__ void update_value(ValType& val, const GradType& grad) { val.slot = grad.slot; - ; val.show += grad.show; val.clk += grad.clk; + val.delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + + optimizer_config::clk_coeff * grad.clk; - update_lr(val.lr, val.lr_g2sum, grad.lr_g, 1.0); + update_lr(val.lr, val.lr_g2sum, grad.lr_g, grad.show); if (val.mf_size == 0) { if (optimizer_config::mf_create_thresholds <= @@ -116,7 +93,7 @@ class Optimizer { } } } else { - update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, 1.0); + update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, grad.show); } } }; diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h index d513728d205398378383a7c0996af2f799f83673..55d0fc561c574dc62e5eeed7502ccaa02946bc8b 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h @@ -16,15 +16,16 @@ limitations under the License. */ namespace optimizer_config { -__constant__ float mf_create_thresholds = 0; __constant__ float nonclk_coeff = 0.1; __constant__ float clk_coeff = 1; + __constant__ float min_bound = -10; __constant__ float max_bound = 10; __constant__ float learning_rate = 0.05; __constant__ float initial_g2sum = 3.0; -__constant__ float initial_range = 1e-4; +__constant__ float initial_range = 0; +__constant__ float mf_create_thresholds = 10; __constant__ float mf_learning_rate = 0.05; __constant__ float mf_initial_g2sum = 3.0; __constant__ float mf_initial_range = 1e-4; diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h index 871d2e251b41016d548fa1e257560aca9db030d7..4e529de077593777c1ab326db395febaefb9564a 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_wrapper.h @@ -25,6 +25,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_PSLIB #include "paddle/fluid/framework/heter_service.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable_helper.h" diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 67ff6b6acaefb26adc1389559a763b98f41a533a..f8dfccf58ff960c0ecc006951fb1f507587255e7 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -40,8 +40,7 @@ namespace framework { std::shared_ptr PSGPUWrapper::s_instance_ = NULL; bool PSGPUWrapper::is_initialized_ = false; -void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, - uint64_t table_id, int feature_dim) { +void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { VLOG(3) << "PSGPUWrapper::BuildGPUPSTask begin"; platform::Timer timeline; timeline.Start(); @@ -68,8 +67,6 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, thread_keys_.resize(thread_keys_thread_num_); for (int i = 0; i < thread_keys_thread_num_; i++) { thread_keys_[i].resize(thread_keys_shard_num_); - for (int j = 0; j < thread_keys_shard_num_; j++) { - } } const std::deque& vec_data = input_channel->GetData(); size_t total_len = vec_data.size(); @@ -139,17 +136,16 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, local_ptr[i].resize(local_keys[i].size()); } timeline.Start(); - auto ptl_func = [this, &local_keys, &local_ptr, &table_id, - &fleet_ptr](int i) { + auto ptl_func = [this, &local_keys, &local_ptr, &fleet_ptr](int i) { size_t key_size = local_keys[i].size(); #ifdef PADDLE_WITH_PSLIB auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_ptr[i].data()), table_id, + reinterpret_cast(local_ptr[i].data()), this->table_id_, local_keys[i].data(), key_size); #endif #ifdef PADDLE_WITH_PSCORE auto tt = fleet_ptr->_worker_ptr->pull_sparse_ptr( - reinterpret_cast(local_ptr[i].data()), table_id, + reinterpret_cast(local_ptr[i].data()), this->table_id_, local_keys[i].data(), key_size); #endif tt.wait(); @@ -255,7 +251,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, } } #endif - VLOG(1) << "GpuPs build hbmps done"; + VLOG(3) << "GpuPs build hbmps done"; device_mutex[dev]->unlock(); } @@ -272,11 +268,8 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, << " seconds."; } -void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { +void PSGPUWrapper::BuildGPUTask(std::shared_ptr gpu_task) { int device_num = heter_devices_.size(); - std::shared_ptr gpu_task = gpu_task_pool_.Get(); - gpu_task->Reset(); - BuildTask(gpu_task, table_id, feature_dim); platform::Timer timeline; timeline.Start(); @@ -291,15 +284,21 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { delete HeterPs_; HeterPs_ = nullptr; } + if (size_max <= 0) { + VLOG(1) << "Skip build gpu ps cause feasign nums = " << size_max; + return; + } std::vector threads(device_num); HeterPs_ = HeterPsBase::get_instance(size_max, resource_); HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); auto build_func = [this, &gpu_task, &feature_keys_count](int i) { - std::cout << "building table: " << i << std::endl; + VLOG(3) << "building table: " << i; this->HeterPs_->build_ps(i, gpu_task->device_keys_[i].data(), gpu_task->device_values_[i].data(), feature_keys_count[i], 500000, 2); - HeterPs_->show_one_table(i); + if (feature_keys_count[i] > 0) { + HeterPs_->show_one_table(i); + } }; for 
(size_t i = 0; i < threads.size(); i++) { threads[i] = std::thread(build_func, i); @@ -310,7 +309,109 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { timeline.Pause(); VLOG(1) << "GpuPs build table total costs: " << timeline.ElapsedSec() << " s."; - gpu_task_pool_.Push(gpu_task); +} + +void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { + platform::Timer timer; + VLOG(3) << "Begin LoadIntoMemory(), dataset[" << dataset_ << "]"; + timer.Start(); + dataset_->LoadIntoMemory(); + timer.Pause(); + VLOG(0) << "LoadIntoMemory cost: " << timer.ElapsedSec() << "s"; + + // local shuffle + if (is_shuffle) { + dataset_->LocalShuffle(); + } + + std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); + data_ready_channel_->Put(gpu_task); + VLOG(3) << "End LoadIntoMemory(), dataset[" << dataset_ << "]"; +} + +void PSGPUWrapper::start_build_thread() { + running_ = true; + VLOG(3) << "start build CPU&GPU ps thread."; + build_cpu_threads_ = std::thread([this] { build_cpu_thread(); }); + build_gpu_threads_ = std::thread([this] { build_gpu_thread(); }); +} + +void PSGPUWrapper::build_cpu_thread() { + while (running_) { + std::shared_ptr gpu_task = nullptr; + if (!data_ready_channel_->Get(gpu_task)) { + continue; + } + VLOG(3) << "thread BuildTask start."; + platform::Timer timer; + timer.Start(); + // build cpu ps data process + BuildTask(gpu_task); + timer.Pause(); + VLOG(1) << "thread BuildTask end, cost time: " << timer.ElapsedSec() << "s"; + buildcpu_ready_channel_->Put(gpu_task); + } + VLOG(3) << "build cpu thread end"; +} + +void PSGPUWrapper::build_gpu_thread() { + while (running_) { + std::shared_ptr gpu_task = nullptr; + if (!gpu_free_channel_->Get(gpu_task)) { + continue; + } + if (!buildcpu_ready_channel_->Get(gpu_task)) { + continue; + } + VLOG(3) << "thread BuildGPUTask start."; + platform::Timer timer; + timer.Start(); + BuildGPUTask(gpu_task); + timer.Pause(); + VLOG(1) << "thread BuildGPUTask end, cost time: " << timer.ElapsedSec() + << "s"; + + gpu_task_pool_.Push(gpu_task); + train_ready_channel_->Put(gpu_task); + } + VLOG(3) << "build gpu thread end"; +} + +void PSGPUWrapper::BeginPass() { + platform::Timer timer; + timer.Start(); + if (current_task_) { + PADDLE_THROW( + platform::errors::Fatal("[BeginPass] current task is not ended.")); + } + // load+build done + if (!train_ready_channel_->Get(current_task_)) { + PADDLE_THROW(platform::errors::Fatal("train_ready_channel_ failed.")); + } + timer.Pause(); + VLOG(1) << "BeginPass end, cost time: " << timer.ElapsedSec() << "s"; +} + +void PSGPUWrapper::EndPass() { + if (!current_task_) { + PADDLE_THROW( + platform::errors::Fatal("[EndPass] current task has been ended.")); + } + platform::Timer timer; + timer.Start(); + size_t keysize_max = 0; + // in case of feasign_num = 0, skip dump_to_cpu + for (size_t i = 0; i < heter_devices_.size(); i++) { + keysize_max = std::max(keysize_max, current_task_->device_keys_[i].size()); + } + if (keysize_max != 0) { + HeterPs_->end_pass(); + } + current_task_ = nullptr; + gpu_free_channel_->Put(current_task_); + timer.Pause(); + VLOG(1) << "EndPass end, cost time: " << timer.ElapsedSec() << "s"; } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index cfb23d1be2acfed0a878cb3bffa241afa2cf3de8..2bbe595419094567eb991a042ca41d80d3202926 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ 
-82,9 +82,33 @@ class PSGPUWrapper { const int hidden_size, const int64_t total_length, const int batch_size); - void BuildGPUPS(const uint64_t table_id, int feature_dim); - void BuildTask(std::shared_ptr gpu_task, uint64_t table_id, - int feature_dim); + void BuildGPUTask(std::shared_ptr gpu_task); + void BuildTask(std::shared_ptr gpu_task); + void LoadIntoMemory(bool is_shuffle); + void BeginPass(); + void EndPass(); + void start_build_thread(); + void build_cpu_thread(); + void build_gpu_thread(); + + void Finalize() { + VLOG(3) << "PSGPUWrapper Begin Finalize."; + if (s_instance_ == nullptr) { + return; + } + data_ready_channel_->Close(); + buildcpu_ready_channel_->Close(); + gpu_free_channel_->Close(); + train_ready_channel_->Close(); + running_ = false; + VLOG(3) << "begin stop build_cpu_threads_"; + build_cpu_threads_.join(); + VLOG(3) << "begin stop build_gpu_threads_"; + build_gpu_threads_.join(); + s_instance_ = nullptr; + VLOG(3) << "PSGPUWrapper Finalize Finished."; + } + void InitializeGPU(const std::vector& dev_ids) { if (s_instance_ != NULL && is_initialized_ == false) { VLOG(3) << "PSGPUWrapper Begin InitializeGPU"; @@ -129,6 +153,24 @@ class PSGPUWrapper { #endif } heter_devices_ = dev_ids; + data_ready_channel_->Open(); + data_ready_channel_->SetCapacity(3); + buildcpu_ready_channel_->Open(); + buildcpu_ready_channel_->SetCapacity(3); + gpu_free_channel_->Open(); + gpu_free_channel_->SetCapacity(1); + train_ready_channel_->Open(); + train_ready_channel_->SetCapacity(1); + + current_task_ = nullptr; + gpu_free_channel_->Put(current_task_); + + table_id_ = 1; +#ifdef PADDLE_WITH_PSLIB + table_id_ = 0; +#endif + // start build cpu&gpu ps thread + start_build_thread(); } } @@ -206,7 +248,6 @@ class PSGPUWrapper { slot_vector_ = slot_vector; } - void EndPass() { HeterPs_->end_pass(); } void ShowOneTable(int index) { HeterPs_->show_one_table(index); } private: @@ -222,6 +263,7 @@ class PSGPUWrapper { std::vector slot_vector_; int multi_node_{0}; int node_size_; + uint64_t table_id_; std::vector inner_comms_; std::vector inter_comms_; std::vector inter_ncclids_; @@ -233,6 +275,27 @@ class PSGPUWrapper { int thread_keys_shard_num_ = 37; uint64_t max_fea_num_per_pass_ = 5000000000; + std::shared_ptr< + paddle::framework::ChannelObject>> + data_ready_channel_ = + paddle::framework::MakeChannel>(); + std::shared_ptr< + paddle::framework::ChannelObject>> + buildcpu_ready_channel_ = + paddle::framework::MakeChannel>(); + std::shared_ptr< + paddle::framework::ChannelObject>> + gpu_free_channel_ = + paddle::framework::MakeChannel>(); + std::shared_ptr< + paddle::framework::ChannelObject>> + train_ready_channel_ = + paddle::framework::MakeChannel>(); + std::shared_ptr current_task_ = nullptr; + std::thread build_cpu_threads_; + std::thread build_gpu_threads_; + bool running_ = false; + protected: static bool is_initialized_; }; diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h index b0247fe795b3eae17fe459c9a14b188663974870..ebbfd446a03de203d6af1a6d3f77ff392ba3ca90 100644 --- a/paddle/fluid/framework/grad_op_desc_maker.h +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -219,6 +219,19 @@ class SingleGradOpMaker public: using GradOpBaseMakerBase::GradOpBaseMakerBase; + virtual const framework::Attribute& GetAttr(const std::string& name) const { + auto it = Attrs().find(name); + if (it == Attrs().end()) { + it = this->DefaultAttrsMap().find(name); + PADDLE_ENFORCE_EQ(it != this->DefaultAttrsMap().end(), true, + 
platform::errors::NotFound( + "Cannot find attribute [%s] in operator [%s]", name, + this->ForwardOpType())); + } + + return it->second; + } + std::shared_ptr operator()() const final { auto node = this->NewGradNode(); auto& inplace_map = this->GetInplaceMap(); @@ -228,6 +241,7 @@ class SingleGradOpMaker { imperative::TracedGradOp traced_grad_op(node); try { + traced_grad_op.SetDefaultAttrsMap(this->DefaultAttrsMap()); this->Apply(&traced_grad_op); } catch (platform::EnforceNotMet& exception) { framework::AppendErrorOpHint(traced_grad_op.Type(), &exception); diff --git a/paddle/fluid/framework/heter_service.h b/paddle/fluid/framework/heter_service.h index 3f65eaf3aa1216275edd8d5bb5b44f640f98625b..7e5bf138d9fa9270eef7b19e0b350301a2290ab7 100644 --- a/paddle/fluid/framework/heter_service.h +++ b/paddle/fluid/framework/heter_service.h @@ -72,299 +72,6 @@ class HeterXpuService : public HeterService { std::unordered_map handler_map_; }; -enum HeterTaskState { PULL_SPARSE, OP_RUN, XPU, OP_RUN_END, PUSH_GRAD, DONE }; - -class HeterTask { - public: - void Update() { - if (state_ == PULL_SPARSE) { - state_ = OP_RUN; - } else if (state_ == OP_RUN) { - state_ = XPU; - // state_ = PUSH_GRAD; - // state_ = PUSH_GRAD; - } else if (state_ == XPU) { - state_ = OP_RUN_END; - } else if (state_ == OP_RUN_END) { - state_ = PUSH_GRAD; - } else if (state_ == PUSH_GRAD) { - state_ = DONE; - } - } - void Reset() { - total_time = 0; - read_time = 0; - pack_time = 0; - pull_sparse_local_time = 0; - op_all_time = 0; - xpu_op_time = 0; - xpu_wait_time = 0; - cpu_op_time = 0; - collect_label_time = 0; - fill_sparse_time = 0; - push_sparse_time = 0; - gpu_2_cpu_time = 0; - cpu_2_gpu_time = 0; - timeline.Reset(); - } - void Show() { - std::cout << "features size " << features_.size() << std::endl; - for (size_t i = 0; i < features_.size(); ++i) { - std::cout << "features[" << i << "] size " << features_[i].size() - << std::endl; - } - } - void PackTask(Scope* scope, int taskid, DataFeed* reader, int cur_batch, - const ProgramDesc& program); - void PackGpuTask(Scope* thread_scope, DataFeed* reader, - const ProgramDesc& program); - - Scope* scope_{nullptr}; - int taskid_; - int cur_batch_; - HeterTaskState state_; - // cache - std::map> features_; - std::map> feature_labels_; - std::map>> feature_values_; - std::map>> feature_grads_; - std::map> sparse_push_keys_; - double total_time{0}; - double read_time{0}; - double pack_time{0}; - double pull_sparse_local_time{0}; - double op_all_time{0}; - double xpu_op_time{0}; - double xpu_wait_time{0}; - double cpu_op_time{0}; - double collect_label_time{0}; - double fill_sparse_time{0}; - double push_sparse_time{0}; - double gpu_2_cpu_time{0}; - double cpu_2_gpu_time{0}; - platform::Timer timeline; -}; -#endif -template -class HeterObjectPool { - public: - HeterObjectPool() {} - virtual ~HeterObjectPool(){}; - std::shared_ptr Get() { - std::lock_guard lock(mutex_); - if (pool_.empty()) { - num_ += 1; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - VLOG(3) << "pool construct size: " << num_; -#endif - return std::make_shared(); - } else { - auto ret = pool_.back(); - pool_.pop_back(); - return ret; - } - } - void Push(std::shared_ptr data) { - std::lock_guard lock(mutex_); - pool_.push_back(std::move(data)); - } - int Size() { - std::lock_guard lock(mutex_); - return pool_.size(); - } - std::shared_ptr& GetElement(int i) { return pool_[i]; } - - private: - std::vector> pool_; - std::mutex mutex_; - int num_{0}; -}; - -#ifdef PADDLE_WITH_PSLIB -struct 
BthreadMutextGuard { - BthreadMutextGuard(bthread_mutex_t* rho) { - mutex_ = rho; - bthread_mutex_lock(mutex_); - } - ~BthreadMutextGuard() { bthread_mutex_unlock(mutex_); } - bthread_mutex_t* mutex_; -}; - -template -class BtObjectPool { - public: - BtObjectPool() { - bthread_mutex_init(&mutex_, NULL); - bthread_cond_init(&cond_, NULL); - } - - virtual ~BtObjectPool() { - bthread_cond_destroy(&cond_); - bthread_mutex_destroy(&mutex_); - }; - - std::shared_ptr Get() { - BthreadMutextGuard guard(&mutex_); - while (pool_.empty()) { - bthread_cond_wait(&cond_, &mutex_); - } - auto ret = pool_.back(); - pool_.pop_back(); - return ret; - } - - void Push(std::shared_ptr data) { - BthreadMutextGuard guard(&mutex_); - pool_.push_back(std::move(data)); - bthread_cond_signal(&cond_); - } - - int Size() { return pool_.size(); } - - std::shared_ptr& GetElement(int i) { return pool_[i]; } - - private: - std::vector> pool_; - bthread_mutex_t mutex_; - bthread_cond_t cond_; - int num_{0}; -}; - -template -struct HeterNode { - K key; - T value; - HeterNode* prev; - HeterNode* next; -}; - -template -class HeterList { - public: - HeterList() : head_(new HeterNode), tail_(new HeterNode) { - head_->prev = NULL; - head_->next = tail_; - tail_->prev = head_; - tail_->next = NULL; - size = 0; - cap_ = 1e9; - } - - ~HeterList() { - delete head_; - delete tail_; - } - - void SetCap(int num) { cap_ = num; } - - bool TryPut(K& key, T& value) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this] { return size < cap_; }); - if (task_map_.find(key) != task_map_.end()) { - task_map_.erase(key); - return false; - } else { - HeterNode* node = new HeterNode; - node->key = key; - node->value = value; - map_[node->key] = node; - attach(node); - return true; - } - } - - bool Put(K& key, T& value) { - std::unique_lock lock(mutex_); - cond_.wait(lock, [this] { return size < cap_; }); - HeterNode* node = new HeterNode; - node->key = key; - node->value = value; - map_[node->key] = node; - attach(node); - return true; - } - - T TryGet(const K& key) { - std::lock_guard lock(mutex_); - auto iter = map_.find(key); - if (iter != map_.end()) { - HeterNode* node = iter->second; - detach(node); - cond_.notify_one(); - T ret = std::move(node->value); - map_.erase(key); - delete node; - return ret; - } - task_map_.insert(key); - return nullptr; - } - - T Get(const K& key) { - std::lock_guard lock(mutex_); - auto iter = map_.find(key); - if (iter != map_.end()) { - HeterNode* node = iter->second; - detach(node); - cond_.notify_one(); - T ret = std::move(node->value); - map_.erase(key); - delete node; - return ret; - } - return nullptr; - } - - T Get() { - std::lock_guard lock(mutex_); - HeterNode* node = head_->next; - if (node == tail_) { - return nullptr; - } else { - detach(node); - cond_.notify_one(); - T ret = std::move(node->value); - map_.erase(node->key); - delete node; - return ret; - } - } - - bool Empty() { - std::lock_guard lock(mutex_); - return head_->next == tail_; - } - - int Size() { - std::lock_guard lock(mutex_); - return size; - } - - private: - void detach(HeterNode* node) { - node->prev->next = node->next; - node->next->prev = node->prev; - size--; - } - - void attach(HeterNode* node) { - node->prev = head_; - node->next = head_->next; - head_->next->prev = node; - head_->next = node; - size++; - } - - private: - HeterNode* head_; - HeterNode* tail_; - std::unordered_map*> map_; - std::unordered_set task_map_; - std::mutex mutex_; - std::condition_variable cond_; - int cap_; - int size; -}; #endif } // namespace 
framework diff --git a/paddle/fluid/framework/heter_util.h b/paddle/fluid/framework/heter_util.h new file mode 100644 index 0000000000000000000000000000000000000000..eb9f3040afe25e8d8095dea70c5a8c731718f8cb --- /dev/null +++ b/paddle/fluid/framework/heter_util.h @@ -0,0 +1,333 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_PSLIB +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include +#include "bthread/bthread.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/timer.h" + +namespace paddle { +namespace framework { +class DataFeed; +enum HeterTaskState { PULL_SPARSE, OP_RUN, XPU, OP_RUN_END, PUSH_GRAD, DONE }; + +class HeterTask { + public: + HeterTask() {} + virtual ~HeterTask() {} + + void Update() { + if (state_ == PULL_SPARSE) { + state_ = OP_RUN; + } else if (state_ == OP_RUN) { + state_ = XPU; + // state_ = PUSH_GRAD; + // state_ = PUSH_GRAD; + } else if (state_ == XPU) { + state_ = OP_RUN_END; + } else if (state_ == OP_RUN_END) { + state_ = PUSH_GRAD; + } else if (state_ == PUSH_GRAD) { + state_ = DONE; + } + } + void Reset() { + total_time = 0; + read_time = 0; + pack_time = 0; + pull_sparse_local_time = 0; + op_all_time = 0; + xpu_op_time = 0; + xpu_wait_time = 0; + cpu_op_time = 0; + collect_label_time = 0; + fill_sparse_time = 0; + push_sparse_time = 0; + gpu_2_cpu_time = 0; + cpu_2_gpu_time = 0; + timeline.Reset(); + } + void Show() { + std::cout << "features size " << features_.size() << std::endl; + for (size_t i = 0; i < features_.size(); ++i) { + std::cout << "features[" << i << "] size " << features_[i].size() + << std::endl; + } + } + void PackTask(Scope* scope, int taskid, DataFeed* reader, int cur_batch, + const ProgramDesc& program); + void PackGpuTask(Scope* thread_scope, DataFeed* reader, + const ProgramDesc& program); + + Scope* scope_{nullptr}; + int taskid_; + int cur_batch_; + HeterTaskState state_; + // cache + std::map> features_; + std::map> feature_labels_; + std::map>> feature_values_; + std::map>> feature_grads_; + std::map> sparse_push_keys_; + double total_time{0}; + double read_time{0}; + double pack_time{0}; + double pull_sparse_local_time{0}; + double op_all_time{0}; + double xpu_op_time{0}; + double xpu_wait_time{0}; + double cpu_op_time{0}; + double collect_label_time{0}; + double fill_sparse_time{0}; + double push_sparse_time{0}; + double gpu_2_cpu_time{0}; + double cpu_2_gpu_time{0}; + platform::Timer timeline; +}; +#endif +template +class HeterObjectPool { + public: + HeterObjectPool() {} + virtual ~HeterObjectPool() {} + std::shared_ptr Get() { + std::lock_guard lock(mutex_); + if (pool_.empty()) { + num_ += 1; + return std::make_shared(); + } else { + auto ret = pool_.back(); + pool_.pop_back(); + return ret; + } + } + void Push(std::shared_ptr data) { + std::lock_guard lock(mutex_); + pool_.push_back(std::move(data)); + } 
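+  // Hypothetical usage of this pool (HeterTask is only an example element
+  // type; any default-constructible type works, since Get() falls back to
+  // std::make_shared when the pool is empty):
+  //   HeterObjectPool<HeterTask> pool;
+  //   auto task = pool.Get();   // reuse a pooled object or construct one
+  //   task->Reset();
+  //   /* fill and process the task */
+  //   pool.Push(task);          // return the object for later reuse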
+ int Size() { + std::lock_guard lock(mutex_); + return pool_.size(); + } + bool Empty() { + std::lock_guard lock(mutex_); + return pool_.empty(); + } + std::shared_ptr& GetElement(int i) { return pool_[i]; } + + private: + std::vector> pool_; + std::mutex mutex_; + int num_{0}; +}; + +#ifdef PADDLE_WITH_PSLIB +struct BthreadMutextGuard { + BthreadMutextGuard(bthread_mutex_t* rho) { + mutex_ = rho; + bthread_mutex_lock(mutex_); + } + ~BthreadMutextGuard() { bthread_mutex_unlock(mutex_); } + bthread_mutex_t* mutex_; +}; + +template +class BtObjectPool { + public: + BtObjectPool() { + bthread_mutex_init(&mutex_, NULL); + bthread_cond_init(&cond_, NULL); + } + + virtual ~BtObjectPool() { + bthread_cond_destroy(&cond_); + bthread_mutex_destroy(&mutex_); + } + + std::shared_ptr Get() { + BthreadMutextGuard guard(&mutex_); + while (pool_.empty()) { + bthread_cond_wait(&cond_, &mutex_); + } + auto ret = pool_.back(); + pool_.pop_back(); + return ret; + } + + void Push(std::shared_ptr data) { + BthreadMutextGuard guard(&mutex_); + pool_.push_back(std::move(data)); + bthread_cond_signal(&cond_); + } + + int Size() { return pool_.size(); } + + std::shared_ptr& GetElement(int i) { return pool_[i]; } + + private: + std::vector> pool_; + bthread_mutex_t mutex_; + bthread_cond_t cond_; + int num_{0}; +}; + +template +struct HeterNode { + K key; + T value; + HeterNode* prev; + HeterNode* next; +}; + +template +class HeterList { + public: + HeterList() : head_(new HeterNode), tail_(new HeterNode) { + head_->prev = NULL; + head_->next = tail_; + tail_->prev = head_; + tail_->next = NULL; + size = 0; + cap_ = 1e9; + } + + ~HeterList() { + delete head_; + delete tail_; + } + + void SetCap(int num) { cap_ = num; } + + bool TryPut(K& key, T& value) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this] { return size < cap_; }); + if (task_map_.find(key) != task_map_.end()) { + task_map_.erase(key); + return false; + } else { + HeterNode* node = new HeterNode; + node->key = key; + node->value = value; + map_[node->key] = node; + attach(node); + return true; + } + } + + bool Put(K& key, T& value) { + std::unique_lock lock(mutex_); + cond_.wait(lock, [this] { return size < cap_; }); + HeterNode* node = new HeterNode; + node->key = key; + node->value = value; + map_[node->key] = node; + attach(node); + return true; + } + + T TryGet(const K& key) { + std::lock_guard lock(mutex_); + auto iter = map_.find(key); + if (iter != map_.end()) { + HeterNode* node = iter->second; + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(key); + delete node; + return ret; + } + task_map_.insert(key); + return nullptr; + } + + T Get(const K& key) { + std::lock_guard lock(mutex_); + auto iter = map_.find(key); + if (iter != map_.end()) { + HeterNode* node = iter->second; + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(key); + delete node; + return ret; + } + return nullptr; + } + + T Get() { + std::lock_guard lock(mutex_); + HeterNode* node = head_->next; + if (node == tail_) { + return nullptr; + } else { + detach(node); + cond_.notify_one(); + T ret = std::move(node->value); + map_.erase(node->key); + delete node; + return ret; + } + } + + bool Empty() { + std::lock_guard lock(mutex_); + return head_->next == tail_; + } + + int Size() { + std::lock_guard lock(mutex_); + return size; + } + + private: + void detach(HeterNode* node) { + node->prev->next = node->next; + node->next->prev = node->prev; + size--; + } + + void attach(HeterNode* node) { + node->prev 
= head_; + node->next = head_->next; + head_->next->prev = node; + head_->next = node; + size++; + } + + private: + HeterNode* head_; + HeterNode* tail_; + std::unordered_map*> map_; + std::unordered_set task_map_; + std::mutex mutex_; + std::condition_variable cond_; + int cap_; + int size; +}; +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/heterbox_trainer.cc b/paddle/fluid/framework/heterbox_trainer.cc deleted file mode 100644 index 1f6dc39ae851dfa5dc4790c4a3994a19981be3e0..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/heterbox_trainer.cc +++ /dev/null @@ -1,275 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include "io/fs.h" -#include "paddle/fluid/framework/data_feed_factory.h" -#include "paddle/fluid/framework/data_set.h" -#include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/trainer.h" -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \ - defined PADDLE_WITH_XPU) && \ - (defined PADDLE_WITH_PSLIB) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/cuda_device_guard.h" -#endif -namespace paddle { -namespace framework { - -void HeterBoxTrainer::Initialize(const TrainerDesc& trainer_desc, - Dataset* dataset) { - thread_num_ = trainer_desc.thread_num(); - param_ = trainer_desc.downpour_param(); - for (int i = 0; i < param_.dense_table_size(); ++i) { - uint64_t table_id = static_cast(param_.dense_table(i).table_id()); - auto table = param_.dense_table(i); - dense_grad_names_[table_id].resize(table.dense_grad_name_size()); - for (int j = 0; j < table.dense_grad_name_size(); ++j) { - dense_grad_names_[table_id][j] = table.dense_grad_name(j); - } - } - RegisterHeterCallback(); - scale_datanorm_ = trainer_desc.scale_datanorm(); - int place_num = trainer_desc.worker_places_size(); - const std::vector readers = - dataset->GetReaders(); - for (int i = 0; i < place_num; ++i) { - int num = trainer_desc.worker_places(i); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace place = platform::CUDAPlace(num); - platform::CUDADeviceGuard guard(place.device); - gpuStream_t stream; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); -#endif - copy_streams_.push_back(stream); - places_.push_back(place); - gpuEvent_t event; -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - hipEventCreateWithFlags(&event, hipEventDisableTiming)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); -#endif - events_.push_back(event); -#endif -#ifdef PADDLE_WITH_XPU - platform::XPUPlace place = platform::XPUPlace(num); - places_.push_back(place); -#endif - } - for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size(); - i++) { - 
need_merge_var_names_.push_back( - trainer_desc.downpour_param().stat_var_names(i)); - } - VLOG(3) << "going to initialize pull dense worker"; - pull_dense_worker_ = PullDenseWorker::GetInstance(); - pull_dense_worker_->Initialize(trainer_desc); - VLOG(3) << "initialize pull dense worker"; - SetDebug(trainer_desc.debug()); - fleet_ptr_ = FleetWrapper::GetInstance(); - trainer_desc_ = trainer_desc; - workers_.resize(place_num); - for (int i = 0; i < place_num; ++i) { - workers_[i] = DeviceWorkerFactory::CreateDeviceWorker( - trainer_desc.device_worker_name()); - workers_[i]->SetDeviceIndex(i); - workers_[i]->SetDataFeed(readers[i]); - workers_[i]->Initialize(trainer_desc); - workers_[i]->SetWorkerNum(place_num); - } -} - -void HeterBoxTrainer::DumpWork(int tid) {} - -void HeterBoxTrainer::RegisterHeterCallback() { - auto fleet_ptr = FleetWrapper::GetInstance(); - fleet_ptr->RegisterHeterCallback([this](int worker, int taskid) { - // workers_[worker]->Schedule(taskid); - }); -} - -void HeterBoxTrainer::InitTrainerEnv(const ProgramDesc& main_program, - const platform::Place& place) { - for (size_t i = 0; i < places_.size(); ++i) { - workers_[i]->SetPlace(places_[i]); - workers_[i]->SetStream(copy_streams_[i]); - workers_[i]->SetEvent(events_[i]); - workers_[i]->SetReaderPlace(platform::CPUPlace()); - workers_[i]->SetRootScope(root_scope_); - workers_[i]->CreateDeviceResource(main_program); // Program - workers_[i]->BindingDataFeedMemory(); -#ifdef PADDLE_WITH_PSLIB - workers_[i]->CacheProgram(main_program); -#endif - } - for (size_t num = 0; num < places_.size(); ++num) { - auto place = places_[num]; - Scope* scope = workers_[num]->GetThreadScope(); - auto stream = copy_streams_[num]; - auto event = events_[num]; - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; - platform::CUDADeviceGuard guard(dev_id); - auto& block = main_program.Block(0); - for (auto& var : block.AllVars()) { - if (var->Persistable()) { - auto name = var->Name(); - Variable* root_var = root_scope_->FindVar(name); - if (!root_var) { - continue; - } - LoDTensor* root_tensor = root_var->GetMutable(); - auto* ptr = scope->Var(name); - InitializeVariable(ptr, proto::VarType::LOD_TENSOR); - LoDTensor* thread_tensor = ptr->GetMutable(); - -#define HeterMemcpyFunc(cpp_type, proto_type) \ - do { \ - if (root_tensor->type() == proto_type) { \ - HeterMemCpy(thread_tensor, root_tensor, place, stream); \ - } \ - } while (0) - _ForEachDataType_(HeterMemcpyFunc); - } - } -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, stream)); - hipEventSynchronize(event); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream)); - cudaEventSynchronize(event); -#endif - } - place_ = place; -} - -template -void HeterBoxTrainer::HeterMemCpy(LoDTensor* thread_tensor, - LoDTensor* root_tensor, - const paddle::platform::Place& thread_place, - gpuStream_t stream) { - T* thread_ptr = - thread_tensor->mutable_data(root_tensor->dims(), thread_place); - T* root_ptr = root_tensor->data(); - if (platform::is_cpu_place(root_tensor->place())) { - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr, - platform::CPUPlace(), root_ptr, - sizeof(T) * root_tensor->numel(), stream); - } else { - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place), thread_ptr, - BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()), - root_ptr, sizeof(T) * root_tensor->numel(), stream); - } -} - -void HeterBoxTrainer::InitOtherEnv(const ProgramDesc& main_program) { - 
pull_dense_worker_->SetRootScope(root_scope_); - pull_dense_worker_->CreatePinVar(); - for (size_t i = 0; i < places_.size(); ++i) { - pull_dense_worker_->AddThreadScope(workers_[i]->GetThreadScope()); - pull_dense_worker_->AddPlace(places_[i]); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - pull_dense_worker_->AddStream(copy_streams_[i]); -#endif - } - VLOG(3) << "init other env done."; -} - -void HeterBoxTrainer::Run() { - int pull_thread_num = 3 * places_.size(); - for (size_t thidx = 0; thidx < places_.size(); ++thidx) { - workers_[thidx]->device_reader_->Start(); - std::dynamic_pointer_cast( - workers_[thidx]) - ->ResetStat(); - } - for (int i = 0; i < pull_thread_num; ++i) { - int worker_id = i % places_.size(); - pull_threads_.push_back( - std::thread(&DeviceWorker::ProduceTasks, workers_[worker_id].get())); - } - for (size_t thidx = 0; thidx < places_.size(); ++thidx) { - threads_.push_back( - std::thread(&DeviceWorker::TrainFiles, workers_[thidx].get())); - } -} - -template -void HeterBoxTrainer::MergeToRootScope(LoDTensor* root_tensor, - LoDTensor* tensor) { - LoDTensor tmp_root; - TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root); - T* tmp_root_data = tmp_root.data(); - LoDTensor tmp_tensor; - TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor); - T* data = tmp_tensor.data(); - for (int i = 0; i < tmp_tensor.numel(); i++) { - tmp_root_data[i] += data[i]; - } - TensorCopy(tmp_root, platform::CPUPlace(), root_tensor); -} - -Scope* HeterBoxTrainer::GetWorkerScope(int thread_id) { return nullptr; } - -void HeterBoxTrainer::Finalize() { - for (auto& th : pull_threads_) { - th.join(); - } - for (auto& th : threads_) { - th.join(); - } - for (size_t i = 0; i < need_merge_var_names_.size(); i++) { - Variable* root_var = root_scope_->FindVar(need_merge_var_names_[i]); - if (root_var == nullptr) { - continue; - } - LoDTensor* root_tensor = root_var->GetMutable(); - - for (size_t j = 0; j < places_.size(); j++) { - Scope* cur_thread_scope = workers_[j]->GetThreadScope(); - Variable* thread_var = - cur_thread_scope->FindVar(need_merge_var_names_[i]); - if (thread_var == nullptr) { - continue; - } - LoDTensor* thread_tensor = thread_var->GetMutable(); -#define MergeCallback(cpp_type, proto_type) \ - do { \ - if (root_tensor->type() == proto_type) { \ - if (thread_tensor->type() != proto_type) { \ - VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \ - << "] " << need_merge_var_names_[i] \ - << ", root tensor type=" << root_tensor->type() \ - << ", thread tensor type=" << thread_tensor->type(); \ - exit(-1); \ - } \ - MergeToRootScope(root_tensor, thread_tensor); \ - } \ - } while (0) - _ForEachDataType_(MergeCallback); - } - } - pull_dense_worker_->MergeDenseParam(); - root_scope_->DropKids(); -} -} // namespace framework -} // namespace paddle -#endif diff --git a/paddle/fluid/framework/heterbox_worker.cc b/paddle/fluid/framework/heterbox_worker.cc deleted file mode 100644 index 726b651fcf4ec7409eee7d1893803ef67d87db7f..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/heterbox_worker.cc +++ /dev/null @@ -1,753 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/device_worker.h" -#include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/fleet_wrapper.h" -#include "paddle/fluid/framework/fleet/heter_wrapper.h" -#include "paddle/fluid/platform/cpu_helper.h" -#include "paddle/fluid/string/string_helper.h" - -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ - (defined PADDLE_WITH_PSLIB) -#include "paddle/fluid/platform/cuda_device_guard.h" - -#if defined _WIN32 || defined __APPLE__ -#else -#define _LINUX -#endif - -namespace paddle { -namespace framework { - -void HeterBoxWorker::Initialize(const TrainerDesc& desc) { - param_ = desc.downpour_param(); - mpi_rank_ = desc.mpi_rank(); - trainer_desc_ = desc; - for (int i = 0; i < trainer_desc_.xpu_recv_list_size(); ++i) { - send_var_list_.push_back(trainer_desc_.xpu_recv_list(i)); - } - for (int i = 0; i < param_.sparse_table_size(); ++i) { - uint64_t table_id = - static_cast(param_.sparse_table(i).table_id()); - TableParameter table = param_.sparse_table(i); - sparse_key_names_[table_id].resize(table.sparse_key_name_size()); - for (int j = 0; j < table.sparse_key_name_size(); ++j) { - sparse_key_names_[table_id][j] = table.sparse_key_name(j); - } - sparse_value_names_[table_id].resize(table.sparse_value_name_size()); - for (int j = 0; j < table.sparse_value_name_size(); ++j) { - sparse_value_names_[table_id][j] = table.sparse_value_name(j); - } - sparse_grad_names_[table_id].resize(table.sparse_grad_name_size()); - for (int j = 0; j < table.sparse_grad_name_size(); ++j) { - sparse_grad_names_[table_id][j] = table.sparse_grad_name(j); - } - label_var_name_[table_id] = table.label_var_name(); - sparse_push_keys_[table_id] = std::vector(); - } - - for (int i = 0; i < param_.dense_table_size(); ++i) { - uint64_t table_id = static_cast(param_.dense_table(i).table_id()); - auto table = param_.dense_table(i); - dense_value_names_[table_id].resize(table.dense_value_name_size()); - for (int j = 0; j < table.dense_value_name_size(); ++j) { - dense_value_names_[table_id][j] = table.dense_value_name(j); - } - dense_grad_names_[table_id].resize(table.dense_grad_name_size()); - for (int j = 0; j < table.dense_grad_name_size(); ++j) { - dense_grad_names_[table_id][j] = table.dense_grad_name(j); - } - } - - skip_ops_.resize(param_.skip_ops_size()); - for (int i = 0; i < param_.skip_ops_size(); ++i) { - skip_ops_[i] = param_.skip_ops(i); - } - for (int i = 0; i < param_.stat_var_names_size(); ++i) { - stat_var_name_map_[param_.stat_var_names(i)] = 1; - } - - need_to_push_sparse_ = param_.push_sparse(); - need_to_push_dense_ = param_.push_dense(); - - fleet_ptr_ = FleetWrapper::GetInstance(); - fetch_config_ = desc.fetch_config(); - use_cvm_ = desc.use_cvm(); - // for sparse value accessor, embedding only - no_cvm_ = desc.no_cvm(); - scale_datanorm_ = desc.scale_datanorm(); - dump_slot_ = desc.dump_slot(); - dump_fields_.resize(desc.dump_fields_size()); - for (int i = 0; i < desc.dump_fields_size(); ++i) { - dump_fields_[i] = desc.dump_fields(i); - } - adjust_ins_weight_config_ = desc.adjust_ins_weight_config(); - need_dump_param_ = 
false; - dump_param_.resize(desc.dump_param_size()); - for (int i = 0; i < desc.dump_param_size(); ++i) { - dump_param_[i] = desc.dump_param(i); - } - if (desc.dump_param_size() != 0) { - need_dump_param_ = true; - } - for (int i = 0; i < desc.check_nan_var_names_size(); ++i) { - check_nan_var_names_.push_back(desc.check_nan_var_names(i)); - } - copy_table_config_ = desc.copy_table_config(); - for (int i = 0; i < copy_table_config_.src_sparse_tables_size(); ++i) { - uint64_t src_table = copy_table_config_.src_sparse_tables(i); - uint64_t dest_table = copy_table_config_.dest_sparse_tables(i); - VLOG(3) << "copy_sparse_tables_ push back " << src_table << "->" - << dest_table; - copy_sparse_tables_.push_back(std::make_pair(src_table, dest_table)); - } - for (int i = 0; i < copy_table_config_.src_dense_tables_size(); ++i) { - uint64_t src_table = copy_table_config_.src_dense_tables(i); - uint64_t dest_table = copy_table_config_.dest_dense_tables(i); - VLOG(3) << "copy_dense_tables_ push back " << src_table << "->" - << dest_table; - copy_dense_tables_.push_back(std::make_pair(src_table, dest_table)); - } - for (auto& m : copy_table_config_.table_denpendency_map()) { - if (sparse_key_names_.find(m.key()) != sparse_key_names_.end()) { - // currently only support one dependency - for (auto& value : m.values()) { - table_dependency_[m.key()] = value; - } - } - } - pull_queue_ = paddle::framework::MakeChannel>(); - push_queue_ = paddle::framework::MakeChannel>(); -} - -void HeterBoxWorker::SetChannelWriter(ChannelObject* queue) { - writer_.Reset(queue); -} - -void HeterBoxWorker::SetNeedDump(bool need_dump_field) { - need_dump_field_ = need_dump_field; -} - -void HeterBoxWorker::DumpParam() {} - -void HeterBoxWorker::CollectLabelInfo(std::shared_ptr task, - size_t table_idx) { - if (no_cvm_) { - return; - } - uint64_t table_id = static_cast( - param_.program_config(0).pull_sparse_table_id(table_idx)); - - TableParameter table; - for (auto i : param_.sparse_table()) { - if (i.table_id() == table_id) { - table = i; - break; - } - } - auto& feature = (task->features_)[table_id]; - auto& feature_label = (task->feature_labels_)[table_id]; - Scope* scope = task->scope_; - feature_label.resize(feature.size()); - Variable* var = scope->FindVar(label_var_name_[table_id]); - LoDTensor* tensor = var->GetMutable(); - int64_t* label_ptr = tensor->data(); - - size_t global_index = 0; - for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) { - VLOG(3) << "sparse_key_names_[" << i - << "]: " << sparse_key_names_[table_id][i]; - Variable* fea_var = scope->FindVar(sparse_key_names_[table_id][i]); - if (fea_var == nullptr) { - continue; - } - LoDTensor* tensor = fea_var->GetMutable(); - CHECK(tensor != nullptr) << "tensor of var " - << sparse_key_names_[table_id][i] << " is null"; - - // skip slots which do not have embedding - Variable* emb_var = scope->FindVar(sparse_value_names_[table_id][i]); - if (emb_var == nullptr) { - continue; - } - int64_t* ids = tensor->data(); - size_t fea_idx = 0; - // tensor->lod()[0].size() == batch_size + 1 - for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) { - for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) { - // should be skipped feasign defined in protobuf - if (ids[fea_idx] == 0u) { - continue; - } - feature_label[global_index++] = - static_cast(label_ptr[lod_idx - 1]); - } - } - } - CHECK(global_index == feature.size()) - << "expect fea info size:" << feature.size() << " real:" << global_index; -} - -void 
HeterBoxWorker::FillSparseValue(std::shared_ptr task, - size_t table_idx) { - uint64_t table_id = static_cast( - param_.program_config(0).pull_sparse_table_id(table_idx)); - - TableParameter table; - for (auto i : param_.sparse_table()) { - if (i.table_id() == table_id) { - table = i; - break; - } - } - - auto& fea_value = (task->feature_values_)[table_id]; - Scope* scope = task->scope_; - auto fea_idx = 0u; - - std::vector init_value(table.fea_dim()); - for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) { - std::string slot_name = sparse_key_names_[table_id][i]; - std::string emb_slot_name = sparse_value_names_[table_id][i]; - Variable* var = scope->FindVar(slot_name); - if (var == nullptr) { - continue; - } - LoDTensor* tensor = var->GetMutable(); - CHECK(tensor != nullptr) << "tensor of var " << slot_name << " is null"; - int64_t* ids = tensor->data(); - int len = tensor->numel(); - Variable* var_emb = scope->FindVar(emb_slot_name); - if (var_emb == nullptr) { - continue; - } - LoDTensor* tensor_emb = var_emb->GetMutable(); - float* ptr = tensor_emb->mutable_data({len, table.emb_dim()}, - platform::CPUPlace()); - // memset(ptr, 0, sizeof(float) * len * table.emb_dim()); - auto& tensor_lod = tensor->lod()[0]; - LoD data_lod{tensor_lod}; - tensor_emb->set_lod(data_lod); - - bool is_nid = (adjust_ins_weight_config_.need_adjust() && - adjust_ins_weight_config_.nid_slot() == emb_slot_name); - if (is_nid) { - nid_show_.clear(); - } - int nid_ins_index = 0; - - for (int index = 0; index < len; ++index) { - if (use_cvm_ || no_cvm_) { - if (ids[index] == 0u) { - memcpy(ptr + table.emb_dim() * index, init_value.data(), - sizeof(float) * table.emb_dim()); - if (is_nid) { - nid_show_.push_back(-1); - ++nid_ins_index; - } - continue; - } - memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data(), - sizeof(float) * table.emb_dim()); - if (is_nid && - static_cast(index) == tensor->lod()[0][nid_ins_index]) { - nid_show_.push_back(fea_value[fea_idx][0]); - ++nid_ins_index; - } - fea_idx++; - } else { - if (ids[index] == 0u) { - memcpy(ptr + table.emb_dim() * index, init_value.data() + 2, - sizeof(float) * table.emb_dim()); - if (is_nid) { - nid_show_.push_back(-1); - ++nid_ins_index; - } - continue; - } - memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data() + 2, - sizeof(float) * table.emb_dim()); - if (is_nid && - static_cast(index) == tensor->lod()[0][nid_ins_index]) { - nid_show_.push_back(fea_value[fea_idx][0]); - ++nid_ins_index; - } - fea_idx++; - } - } - } -} - -void HeterBoxWorker::AdjustInsWeight(std::shared_ptr task) { -#ifdef _LINUX - // check var and tensor not null - Scope* scope = task->scope_; - if (!adjust_ins_weight_config_.need_adjust()) { - VLOG(0) << "need_adjust=false, skip adjust ins weight"; - return; - } - Variable* nid_var = scope->FindVar(adjust_ins_weight_config_.nid_slot()); - if (nid_var == nullptr) { - VLOG(0) << "nid slot var " << adjust_ins_weight_config_.nid_slot() - << " is nullptr, skip adjust ins weight"; - return; - } - LoDTensor* nid_tensor = nid_var->GetMutable(); - if (nid_tensor == nullptr) { - VLOG(0) << "tensor of nid slot var " << adjust_ins_weight_config_.nid_slot() - << " is nullptr, skip adjust ins weight"; - return; - } - Variable* ins_weight_var = - scope->FindVar(adjust_ins_weight_config_.ins_weight_slot()); - if (ins_weight_var == nullptr) { - VLOG(0) << "ins weight var " << adjust_ins_weight_config_.ins_weight_slot() - << " is nullptr, skip adjust ins weight"; - return; - } - LoDTensor* ins_weight_tensor = 
ins_weight_var->GetMutable(); - if (ins_weight_tensor == nullptr) { - VLOG(0) << "tensor of ins weight tensor " - << adjust_ins_weight_config_.ins_weight_slot() - << " is nullptr, skip adjust ins weight"; - return; - } - - float* ins_weights = ins_weight_tensor->data(); - size_t len = ins_weight_tensor->numel(); // len = batch size - // here we assume nid_show slot only has one feasign in each instance - CHECK(len == nid_show_.size()) << "ins_weight size should be equal to " - << "nid_show size, " << len << " vs " - << nid_show_.size(); - float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold(); - float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio(); - int64_t nid_adjw_num = 0; - double nid_adjw_weight = 0.0; - size_t ins_index = 0; - for (size_t i = 0; i < len; ++i) { - float nid_show = nid_show_[i]; - VLOG(3) << "nid_show " << nid_show; - if (nid_show < 0) { - VLOG(3) << "nid_show < 0, continue"; - continue; - } - float ins_weight = 1.0; - if (nid_show >= 0 && nid_show < nid_adjw_threshold) { - ins_weight = log(M_E + - (nid_adjw_threshold - nid_show) / nid_adjw_threshold * - nid_adjw_ratio); - // count nid adjw insnum and weight - ++nid_adjw_num; - nid_adjw_weight += ins_weight; - // choose large ins weight - VLOG(3) << "ins weight new " << ins_weight << ", ins weight origin " - << ins_weights[ins_index]; - if (ins_weight > ins_weights[ins_index]) { - VLOG(3) << "ins " << ins_index << " weight changes to " << ins_weight; - ins_weights[ins_index] = ins_weight; - } - ++ins_index; - } - } - VLOG(3) << "nid adjw info: total_adjw_num: " << nid_adjw_num - << ", avg_adjw_weight: " << nid_adjw_weight; -#endif -} - -void HeterBoxWorker::TrainFiles() { - VLOG(3) << "Begin to train files"; - platform::SetNumThreads(1); - need_to_push_dense_ = false; - while (1) { - VLOG(3) << "before heter task"; - std::shared_ptr task; - - if (!pull_queue_->Get(task)) { - VLOG(3) << "get task"; - break; - } - VLOG(3) << "get task done"; - Scope* scope = task->scope_->kids().front(); - VLOG(3) << "get kid done"; - // do computation here - task->timeline.Start(); - for (auto& op : ops_) { - if (op->HasAttr("op_device")) { - auto device = op->Attr("op_device"); - if (device != "gpu") { - continue; - } - } - bool need_skip = false; - for (auto t = 0u; t < skip_ops_.size(); ++t) { - if (op->Type().find(skip_ops_[t]) != std::string::npos) { - need_skip = true; - break; - } - } - if (!need_skip) { - op->Run(*(scope), place_); - } - } - platform::DeviceContextPool::Instance().Get(place_)->Wait(); - task->timeline.Pause(); - task->xpu_op_time += task->timeline.ElapsedSec(); - task->total_time += task->timeline.ElapsedSec(); - push_queue_->Put(task); - } -} - -void HeterTask::PackGpuTask(Scope* thread_scope, DataFeed* reader, - const ProgramDesc& program) { - auto& block = program.Block(0); - if (!scope_) { - scope_ = &(thread_scope->NewScope()); - for (auto& var : block.AllVars()) { - if (!var->Persistable()) { - auto* ptr = scope_->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - } - } - } - reader->AssignFeedVar(*scope_); - cur_batch_ = reader->Next(); -} - -void HeterBoxWorker::ResetStat() { - total_time_ = 0; - read_time_ = 0; - pack_time_ = 0; - pull_sparse_local_time_ = 0; - op_all_time_ = 0; - xpu_op_time_ = 0; - xpu_wait_time_ = 0; - cpu_op_time_ = 0; - collect_label_time_ = 0; - fill_sparse_time_ = 0; - push_sparse_time_ = 0; - gpu_2_cpu_time_ = 0; - cpu_2_gpu_time_ = 0; - total_inst_ = 0; -} - -void HeterBoxWorker::ProduceTasks() { - need_to_push_dense_ = false; - while 
(1) { - std::shared_ptr task; - task = object_pool_.Get(); - task->Reset(); - { - std::lock_guard lock(mutex_); - task->timeline.Start(); - task->PackGpuTask(thread_scope_, device_reader_, program_); - task->timeline.Pause(); - task->pack_time = task->timeline.ElapsedSec(); - task->total_time += task->pack_time; - if (task->cur_batch_ <= 0) { - if (!pull_queue_->Closed() && batch_cnt_ == done_cnt_) { - pull_queue_->Close(); - } - break; - } - batch_cnt_ += 1; - } - for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size(); - ++i) { - uint64_t tid = static_cast( - param_.program_config(0).pull_sparse_table_id(i)); - TableParameter table; - for (auto j : param_.sparse_table()) { - if (j.table_id() == tid) { - table = j; - break; - } - } - task->timeline.Start(); - fleet_ptr_->HeterPullSparseVars(thread_id_, task, tid, - sparse_key_names_[tid], table.fea_dim(), - sparse_value_names_[tid]); - task->timeline.Pause(); - task->pull_sparse_local_time += task->timeline.ElapsedSec(); - task->total_time += task->timeline.ElapsedSec(); - - task->timeline.Start(); - CollectLabelInfo(task, i); - task->timeline.Pause(); - task->collect_label_time += task->timeline.ElapsedSec(); - task->total_time += task->timeline.ElapsedSec(); - - task->timeline.Start(); - FillSparseValue(task, i); - task->timeline.Pause(); - task->fill_sparse_time += task->timeline.ElapsedSec(); - task->total_time += task->timeline.ElapsedSec(); - - auto nid_iter = std::find(sparse_value_names_[tid].begin(), - sparse_value_names_[tid].end(), - adjust_ins_weight_config_.nid_slot()); - if (nid_iter != sparse_value_names_[tid].end()) { - AdjustInsWeight(task); - } - } - - task->timeline.Start(); - size_t op_index = 0; - for (; op_index < ops_.size(); ++op_index) { - auto& op = ops_[op_index]; - if (op->HasAttr("op_device")) { - auto device = op->Attr("op_device"); - if (device == "gpu") { - break; - } - } - bool need_skip = false; - for (auto t = 0u; t < skip_ops_.size(); ++t) { - if (op->Type().find(skip_ops_[t]) != std::string::npos) { - need_skip = true; - break; - } - } - if (!need_skip) { - op->Run(*(task->scope_), platform::CPUPlace()); - } - } - - task->timeline.Pause(); - task->cpu_op_time += task->timeline.ElapsedSec(); - task->total_time += task->timeline.ElapsedSec(); - - task->timeline.Start(); - // prepare for gpu - Scope* cpu_scope = task->scope_; - Scope* gpu_scope = nullptr; - if (cpu_scope->kids().empty()) { - gpu_scope = &cpu_scope->NewScope(); - } else { - gpu_scope = cpu_scope->kids().front(); - } - for (const std::string& name : send_var_list_) { - const LoDTensor& cpu_tensor = cpu_scope->FindVar(name)->Get(); - LoDTensor* gpu_tensor = gpu_scope->Var(name)->GetMutable(); - gpu_tensor->set_lod(cpu_tensor.lod()); - gpu_tensor->Resize(cpu_tensor.dims()); - gpu_tensor->set_layout(cpu_tensor.layout()); - void* gpu_ptr = gpu_tensor->mutable_data(place_, cpu_tensor.type()); - const void* cpu_ptr = cpu_tensor.data(); - memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place_), gpu_ptr, - platform::CPUPlace(), cpu_ptr, - cpu_tensor.numel() * SizeOfType(cpu_tensor.type()), - copy_stream_); - } - task->timeline.Pause(); - task->cpu_2_gpu_time += task->timeline.ElapsedSec(); - task->total_time += task->timeline.ElapsedSec(); - pull_queue_->Put(task); - push_queue_->Get(task); - - int need_copy_grad = 1; - task->timeline.Start(); - for (; op_index < ops_.size(); ++op_index) { - auto& op = ops_[op_index]; - if (op->HasAttr("op_device")) { - auto device = op->Attr("op_device"); - if (device == "gpu") { - continue; - } 
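ProduceTasks above interleaves CPU and GPU work per task: it runs ops until the first one whose op_device attribute is "gpu", copies the send_var_list_ variables into a child scope on the device, hands the task to the GPU thread through pull_queue_, and resumes the remaining CPU-tagged ops once the task comes back on push_queue_. A minimal sketch of that control flow, with hypothetical FakeOp and RunHeterogeneousTask names standing in for the real operator and queue machinery:

#include <cstddef>
#include <functional>
#include <string>
#include <vector>

// Sketch only: split an op list at the first "gpu"-tagged op, run the CPU
// prefix, hand the task off, then run the trailing CPU-tagged ops.
struct FakeOp {
  std::string device;         // "" or "cpu" or "gpu" (op_device attribute)
  std::function<void()> run;  // stand-in for OperatorBase::Run
};

void RunHeterogeneousTask(const std::vector<FakeOp>& ops,
                          const std::function<void()>& hand_off_to_gpu) {
  std::size_t i = 0;
  for (; i < ops.size() && ops[i].device != "gpu"; ++i) {
    ops[i].run();              // CPU prefix: pull sparse, fill values, ...
  }
  hand_off_to_gpu();           // GPU worker runs the "gpu" ops for this task
  for (; i < ops.size(); ++i) {
    if (ops[i].device != "gpu") ops[i].run();  // trailing CPU ops, e.g. push grads
  }
}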
- } - bool need_skip = false; - for (auto t = 0u; t < skip_ops_.size(); ++t) { - if (op->Type().find(skip_ops_[t]) != std::string::npos) { - need_skip = true; - break; - } - } - if (!need_skip) { - need_copy_grad = 0; - op->Run(*(task->scope_), platform::CPUPlace()); - } - } - task->timeline.Pause(); - task->cpu_op_time += task->timeline.ElapsedSec(); - task->total_time += task->timeline.ElapsedSec(); - - VLOG(3) << "fill sparse value for all sparse table done."; - for (std::string& var_name : check_nan_var_names_) { - Variable* var = (task->scope_)->FindVar(var_name); - if (var == nullptr) { - continue; - } - LoDTensor* tensor = var->GetMutable(); - if (tensor == nullptr) { - continue; - } - PADDLE_ENFORCE_EQ(framework::TensorContainsInf(*tensor), false, - platform::errors::InvalidArgument( - "Tensor %s contains Inf.", var_name)); - PADDLE_ENFORCE_EQ(framework::TensorContainsNAN(*tensor), false, - platform::errors::InvalidArgument( - "Tensor %s contains NAN.", var_name)); - } - - if (need_to_push_sparse_) { - // push gradients here - for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size(); - ++i) { - uint64_t tid = static_cast( - param_.program_config(0).push_sparse_table_id(i)); - TableParameter table; - for (auto i : param_.sparse_table()) { - if (i.table_id() == tid) { - table = i; - break; - } - } - Scope* src_scope = task->scope_; - Scope* dest_scope = nullptr; - task->timeline.Start(); - if (need_copy_grad) { - if (cpu_scope->kids().empty()) { - dest_scope = &src_scope->NewScope(); - } else { - dest_scope = src_scope->kids().front(); - } - auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; - platform::CUDADeviceGuard guard(dev_id); - - for (const std::string& name : sparse_grad_names_[tid]) { - const LoDTensor& src_tensor = - src_scope->FindVar(name)->Get(); - LoDTensor* dest_tensor = - dest_scope->Var(name)->GetMutable(); - dest_tensor->set_lod(src_tensor.lod()); - dest_tensor->Resize(src_tensor.dims()); - dest_tensor->set_layout(src_tensor.layout()); - void* dest_ptr = dest_tensor->mutable_data(platform::CPUPlace(), - src_tensor.type()); - const void* src_ptr = src_tensor.data(); - memory::Copy(platform::CPUPlace(), dest_ptr, - BOOST_GET_CONST(platform::CUDAPlace, place_), src_ptr, - src_tensor.numel() * SizeOfType(src_tensor.type()), - copy_stream_); - } - } else { - dest_scope = task->scope_; - } - task->timeline.Pause(); - task->gpu_2_cpu_time += task->timeline.ElapsedSec(); - task->total_time += task->timeline.ElapsedSec(); - - task->timeline.Start(); - fleet_ptr_->HeterPushSparseVars( - task, *(dest_scope), tid, sparse_key_names_[tid], - sparse_grad_names_[tid], table.emb_dim(), &push_sparse_status_, - use_cvm_, dump_slot_, no_cvm_); - task->timeline.Pause(); - task->push_sparse_time += task->timeline.ElapsedSec(); - task->total_time += task->timeline.ElapsedSec(); - } - } - - if (need_to_push_sparse_) { - VLOG(3) << "push sparse gradient done."; - int32_t tmp_push_sparse_wait_times = -1; - static uint32_t push_sparse_wait_times = - static_cast(tmp_push_sparse_wait_times); - if (push_sparse_status_.size() >= push_sparse_wait_times) { - for (auto& t : push_sparse_status_) { - t.wait(); - } - push_sparse_status_.resize(0); - } - - if (tmp_push_sparse_wait_times == -1) { - push_sparse_status_.resize(0); - } - } - { - std::lock_guard lock(mutex_); - total_time_ += task->total_time; - read_time_ += task->read_time; - pack_time_ += task->pack_time; - pull_sparse_local_time_ += task->pull_sparse_local_time; - op_all_time_ += task->op_all_time; - 
xpu_op_time_ += task->xpu_op_time; - xpu_wait_time_ += task->xpu_wait_time; - cpu_op_time_ += task->cpu_op_time; - collect_label_time_ += task->collect_label_time; - fill_sparse_time_ += task->fill_sparse_time; - push_sparse_time_ += task->push_sparse_time; - gpu_2_cpu_time_ += task->gpu_2_cpu_time; - cpu_2_gpu_time_ += task->cpu_2_gpu_time; - total_inst_ += task->cur_batch_; - } - done_cnt_.fetch_add(1, std::memory_order_relaxed); - if (thread_id_ == 0) { - // should be configured here - if (done_cnt_ > 0 && done_cnt_ % 100 == 0) { - fprintf(stderr, "cpu_2_gpu total time: %fs\n", - cpu_2_gpu_time_ / done_cnt_); - fprintf(stderr, "gpu_2_cpu run total time: %fs\n", - gpu_2_cpu_time_ / done_cnt_); - fprintf(stderr, "cpu op run total time: %fs\n", - cpu_op_time_ / done_cnt_); - fprintf(stderr, "xpu op run total time: %fs\n", - xpu_op_time_ / done_cnt_); - fprintf(stderr, "xpu wait total time: %fs\n", - xpu_wait_time_ / done_cnt_); - fprintf(stderr, "pack task time: %fs\n", pack_time_ / done_cnt_); - fprintf(stderr, "train total time: %fs\n", total_time_ / done_cnt_); - fprintf(stderr, "pull sparse local time: %fs\n", - pull_sparse_local_time_ / done_cnt_); - fprintf(stderr, "fill sparse time: %fs\n", - fill_sparse_time_ / done_cnt_); - fprintf(stderr, "push sparse time: %fs\n", - push_sparse_time_ / done_cnt_); - fprintf(stderr, "collect label time: %fs\n", - collect_label_time_ / done_cnt_); - fprintf(stderr, "mean read time: %fs\n", read_time_ / done_cnt_); - fprintf(stderr, "IO percent: %f\n", read_time_ / total_time_ * 100); - fprintf(stderr, "cpu_2_gpu run percent: %f\n", - cpu_2_gpu_time_ / total_time_ * 100); - fprintf(stderr, "gpu_2_cpu run percent: %f\n", - gpu_2_cpu_time_ / total_time_ * 100); - fprintf(stderr, "cpu op run percent: %f\n", - cpu_op_time_ / total_time_ * 100); - fprintf(stderr, "xpu op run percent: %f\n", - xpu_op_time_ / total_time_ * 100); - fprintf(stderr, "xpu wait percent: %f\n", - xpu_wait_time_ / total_time_ * 100); - fprintf(stderr, "pack task percent: %f\n", - pack_time_ / total_time_ * 100); - fprintf(stderr, "pull sparse local time percent: %f\n", - pull_sparse_local_time_ / total_time_ * 100); - fprintf(stderr, "collect label time percent: %f\n", - collect_label_time_ / total_time_ * 100); - fprintf(stderr, "fill sparse time percent: %f\n", - fill_sparse_time_ / total_time_ * 100); - fprintf(stderr, "push sparse time percent: %f\n", - push_sparse_time_ / total_time_ * 100); - fprintf(stderr, "%6.2f instances/s\n", total_inst_ / total_time_); - } - } - - VLOG(3) << "done taskid = " << task->taskid_; - task->scope_->DropKids(); - object_pool_.Push(task); - } -} - -} // end namespace framework -} // end namespace paddle -#endif diff --git a/paddle/fluid/framework/heterxpu_trainer.cc b/paddle/fluid/framework/heterxpu_trainer.cc index 5e1fabf2038cc26d4da555b712cbb3199854d686..8049a1c9424bebf271f55c1247f1277a0836d88d 100644 --- a/paddle/fluid/framework/heterxpu_trainer.cc +++ b/paddle/fluid/framework/heterxpu_trainer.cc @@ -21,6 +21,7 @@ limitations under the License. 
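The profiling block above reports, for each stage, the accumulated seconds divided by the number of finished tasks plus that stage's share of the total time. A small sketch of that arithmetic (illustrative names, not the original members):

#include <cstdio>

// Sketch of the reporting arithmetic: every stage accumulates seconds across
// batches; the report prints the per-batch mean and the share of total time.
void ReportStage(const char* name, double stage_seconds, double total_seconds,
                 long done_batches) {
  if (done_batches <= 0 || total_seconds <= 0.0) return;
  std::printf("%s mean: %fs per batch, %.2f%% of total\n", name,
              stage_seconds / done_batches,
              stage_seconds / total_seconds * 100);
}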
*/ #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_XPU) && \ (defined PADDLE_WITH_PSLIB) diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index 89dc5c7d3ea932388fd8ab220478bb438f6b35f8..0c66622ed7b9a6a6e9fb5112001009c2b95e367a 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -150,6 +150,9 @@ void HogwildWorker::TrainFilesWithProfiler() { VLOG(3) << "Going to run op " << op_name[i]; if (!need_skip) { ops_[i]->Run(*thread_scope_, place_); +#ifdef PADDLE_WITH_HETERPS + dev_ctx_->Wait(); +#endif } VLOG(3) << "Op " << op_name[i] << " Finished"; timeline.Pause(); @@ -167,6 +170,16 @@ void HogwildWorker::TrainFilesWithProfiler() { total_inst += cur_batch; ++batch_cnt; PrintFetchVars(); +#ifdef PADDLE_WITH_HETERPS + dev_ctx_->Wait(); + VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " << total_time + << " seconds, ins_num: " << total_inst; + for (size_t i = 0; i < op_name.size(); ++i) { + VLOG(1) << "card:" << thread_id_ << ", op: " << op_name[i] + << ", mean time: " << op_total_time[i] / total_inst + << "s, totol time:" << op_total_time[i] << "sec"; + } +#else if (thread_id_ == 0) { if (batch_cnt > 0 && batch_cnt % 100 == 0) { for (size_t i = 0; i < ops_.size(); ++i) { @@ -178,6 +191,7 @@ void HogwildWorker::TrainFilesWithProfiler() { fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time); } } +#endif thread_scope_->DropKids(); timeline.Start(); } @@ -195,7 +209,10 @@ void HogwildWorker::TrainFilesWithProfiler() { void HogwildWorker::TrainFiles() { platform::SetNumThreads(1); + platform::Timer timeline; + timeline.Start(); + int total_ins_num = 0; // how to accumulate fetched values here device_reader_->Start(); int cur_batch; @@ -213,9 +230,13 @@ void HogwildWorker::TrainFiles() { } } + total_ins_num += cur_batch; PrintFetchVars(); thread_scope_->DropKids(); } + timeline.Pause(); + VLOG(3) << "worker " << thread_id_ << " train cost " << timeline.ElapsedSec() + << " seconds, ins_num: " << total_ins_num; #if defined PADDLE_WITH_PSCORE if (thread_barrier_) { paddle::distributed::Communicator::GetInstance()->BarrierTriggerDecrement(); diff --git a/paddle/fluid/framework/io/fs.cc b/paddle/fluid/framework/io/fs.cc index 932b44ef351bb67a68f15196acd5f0d9ea59102e..b8aca886e7d60d9ca2e9595ba5063858a4a3ee29 100644 --- a/paddle/fluid/framework/io/fs.cc +++ b/paddle/fluid/framework/io/fs.cc @@ -240,16 +240,16 @@ void set_download_command(const std::string& x) { std::shared_ptr hdfs_open_read(std::string path, int* err_no, const std::string& converter) { - if (fs_end_with_internal(path, ".gz")) { - path = string::format_string("%s -text \"%s\"", hdfs_command().c_str(), + if (download_cmd() != "") { // use customized download command + path = string::format_string("%s \"%s\"", download_cmd().c_str(), path.c_str()); } else { - const std::string file_path = path; - path = string::format_string("%s -cat \"%s\"", hdfs_command().c_str(), - file_path.c_str()); - if (download_cmd() != "") { // use customized download command - path = string::format_string("%s \"%s\"", download_cmd().c_str(), - file_path.c_str()); + if (fs_end_with_internal(path, ".gz")) { + path = string::format_string("%s -text \"%s\"", hdfs_command().c_str(), + 
path.c_str()); + } else { + path = string::format_string("%s -cat \"%s\"", hdfs_command().c_str(), + path.c_str()); } } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0ca78c679aecaa396b59c7d50471baee239ba622..0107f5976499ce3d29673c5203809390e7da3d8c 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -50,8 +50,9 @@ if (WITH_TESTING) endif(WITH_TESTING) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS ${GRAPH_PATTERN_DETECTOR_DEPS}) +cc_library(op_compat_sensible_pass SRCS op_compat_sensible_pass.cc DEPS graph_pattern_detector op_def_api) cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS graph_pattern_detector executor) -cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) +cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS op_compat_sensible_pass) cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper) @@ -86,6 +87,7 @@ pass_library(quant_conv2d_dequant_fuse_pass inference) pass_library(shuffle_channel_detect_pass inference) pass_library(delete_quant_dequant_op_pass inference) pass_library(delete_quant_dequant_filter_op_pass inference) +pass_library(delete_dropout_op_pass inference) pass_library(simplify_with_basic_ops_pass base) pass_library(fc_elementwise_layernorm_fuse_pass base) pass_library(skip_layernorm_fuse_pass base) @@ -138,6 +140,7 @@ cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) +cc_test(test_op_compat_sensible_pass SRCS op_compat_sensible_pass_tester.cc DEPS op_compat_sensible_pass) cc_test(test_fc_fuse_pass_cc SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_fc_lstm_fuse_pass_cc SRCS fc_lstm_fuse_pass_tester.cc DEPS fc_lstm_fuse_pass framework_proto) cc_test(test_fc_gru_fuse_pass_cc SRCS fc_gru_fuse_pass_tester.cc DEPS fc_gru_fuse_pass framework_proto) @@ -168,7 +171,7 @@ if (WITH_MKLDNN) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util) cc_test(test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util) cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass pass_test_util) - set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass conv_op conv_transpose_op math_function im2col vol2col batch_norm_op gelu_op activation_op elementwise_add_op concat_and_split naive_executor device_context) + set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass conv_op conv_transpose_op math_function im2col vol2col batch_norm_op gelu_op activation_op elementwise_add_op concat_and_split naive_executor device_context eigen_function) if (WITH_GPU OR WITH_ROCM) set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv) endif() @@ -185,4 +188,6 @@ endif() cc_test(test_cpu_bfloat16_pass SRCS mkldnn/cpu_bfloat16_pass_tester.cc DEPS cpu_bfloat16_pass) cc_test(test_multi_gru_fuse_pass SRCS mkldnn/multi_gru_fuse_pass_tester.cc DEPS 
multi_gru_fuse_pass) cc_test(test_multi_gru_seq_fuse_pass SRCS mkldnn/multi_gru_seq_fuse_pass_tester.cc DEPS multi_gru_seq_fuse_pass) + set(TEST_FC_RNN_PASS_DEPS fc_gru_fuse_pass fc_lstm_fuse_pass mkldnn_placement_pass) + cc_test(test_fc_rnn_mkldnn_fuse_pass SRCS mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc DEPS ${TEST_FC_RNN_PASS_DEPS}) endif () diff --git a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc index 62d79f987a6702e4240b44e49af4ff047173505f..0e2bb3eaad536fd9e3556f640b76e591bbf2f988 100644 --- a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc +++ b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc @@ -24,6 +24,46 @@ namespace paddle { namespace framework { namespace ir { +AdaptivePool2dConvertGlobalPass::AdaptivePool2dConvertGlobalPass() { + AddOpCompat(OpCompat("pool2d")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("pooling_type") + .IsStringIn({"max", "avg"}) + .End() + .AddAttr("ksize") + .IsType>() + .End() + .AddAttr("global_pooling") + .IsBoolEQ(true) + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("exclusive") + .IsType() + .End() + .AddAttr("adaptive") + .IsBoolEQ(false) + .End() + .AddAttr("ceil_mode") + .IsType() + .End() + .AddAttr("data_format") + .IsStringIn({"NHWC", "NCHW"}) + .End() + .AddAttr("padding_algorithm") + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End(); +} + void AdaptivePool2dConvertGlobalPass::ApplyImpl(ir::Graph* graph) const { std::string name_scope = "adaptive_pool2d_convert_global_pass"; FusePassBase::Init(name_scope, graph); diff --git a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.h b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.h index f16f030d518d02a43e9d0462ccab83f313a1dc34..4a1405004e247dff69635f7ebd766ae030da82e5 100644 --- a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.h +++ b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.h @@ -31,6 +31,7 @@ class Graph; */ class AdaptivePool2dConvertGlobalPass : public FusePassBase { public: + AdaptivePool2dConvertGlobalPass(); virtual ~AdaptivePool2dConvertGlobalPass() {} protected: diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index 34c6777195f84343a6272e99602081ad8efab714..8f6c6968f60dd8318ad0d5b1f2aec11b033d430f 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -23,6 +23,61 @@ namespace paddle { namespace framework { namespace ir { +AttentionLSTMFusePass::AttentionLSTMFusePass() { + AddOpCompat(OpCompat("while")) + .AddInput("X") // A set of variables, unconstrained + .End() + .AddInput("Condition") // An scalar + .IsTensor() + .End() + .AddOutput("Out") // A set of variables, unconstrained + .End() + .AddOutput("StepScopes") // A vector of local scope, unconstrained + .End() + .AddAttr("sub_block") + .IsType() + .End(); + + AddOpCompat(OpCompat("fill_constant")) + .AddInput("ValueTensor") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensorList") // vector> + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("dtype") + .IsNumGE(0) + .IsNumLE(25) + .End() + .AddAttr("shape") + .IsType>() + .End() + .AddAttr("value") + .IsType() + .End(); + + 
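The constructors being added here describe, op by op, what the pass is willing to fuse: AddOpCompat(OpCompat("...")) registers per-input and per-attribute constraints (IsTensor, IsOptional, IsNumGE, IsStringIn, IsBoolEQ, ...), and the pattern handlers below bail out when IsCompat(subgraph, g) rejects the matched subgraph. A much-simplified standalone mock of that idea, not Paddle's real OpCompat classes:

#include <functional>
#include <map>
#include <string>

// Simplified illustration: each attribute gets a predicate, and a candidate
// op is accepted only if every registered predicate holds. The real API is
// the fluent AddOpCompat(OpCompat("pool2d")).AddAttr(...).IsStringIn(...).End()
// chain shown in the diff above.
struct MiniOpCompat {
  std::map<std::string, std::function<bool(const std::string&)>> attr_checks;

  bool IsCompat(const std::map<std::string, std::string>& attrs) const {
    for (const auto& check : attr_checks) {
      auto it = attrs.find(check.first);
      if (it == attrs.end() || !check.second(it->second)) return false;
    }
    return true;
  }
};

// Usage sketch: reject pool2d ops whose pooling_type is not "max" or "avg".
inline MiniOpCompat MakePool2dCompat() {
  MiniOpCompat compat;
  compat.attr_checks["pooling_type"] = [](const std::string& v) {
    return v == "max" || v == "avg";
  };
  return compat;
}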
AddOpCompat(OpCompat("sequence_expand")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("ref_level") + .IsNumGE(-1) + .End(); +} struct Param { std::string X = "concat_0.tmp_0"; std::string C0 = "cell_init"; @@ -43,7 +98,7 @@ struct Param { void PrepareParameters(Graph* graph, const Param& param, ir::Node* lstm_op); -void FindWhileOp(Graph* graph) { +void AttentionLSTMFusePass::FindWhileOp(Graph* graph) const { GraphPatternDetector gpd; std::unordered_set fused_external_ops( {35, 36, 37, 38, 43, 44, 49, 45, 46, 47, 41, 42, 53, 54, 48, @@ -60,6 +115,10 @@ void FindWhileOp(Graph* graph) { auto handle = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } auto* while_pat_node = gpd.pattern().RetrieveNode("while"); auto* while_node = subgraph.at(while_pat_node); marked_nodes.insert(while_node); diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h index 48e3989a5314c613209718a313b076f4ce208ebc..5d4896a6db103cdb83ff12ee14109047a6ab4fc4 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h @@ -23,8 +23,14 @@ namespace ir { class Graph; class AttentionLSTMFusePass : public FusePassBase { + public: + AttentionLSTMFusePass(); + protected: void ApplyImpl(ir::Graph* graph) const override; + + private: + void FindWhileOp(Graph* graph) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc index 56d5831f3329b94d06940107f99150616b03eeb9..e4ac89f04ff6792dd9b05dedb623cec52598df99 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -94,6 +94,77 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, } } +ConvAffineChannelFusePass::ConvAffineChannelFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC"}) + .End(); + + AddOpCompat(OpCompat("affine_channel")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("data_layout") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); +} + void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -116,6 +187,11 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { int found_conv_ac_count = 0; 
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed."; + return; + } + VLOG(4) << "handle ConvAffineChannel fuse"; GET_CONV_BN_NODES(conv_ac_pattern); @@ -149,6 +225,7 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetType("elementwise_add"); desc.SetAttr("axis", 1); desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists("use_mkldnn")); + auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); @@ -164,6 +241,75 @@ void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_conv_ac_count); } +ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC"}) + .End(); + AddOpCompat(OpCompat("affine_channel")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("data_layout") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); +} + void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -186,6 +332,12 @@ void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { int found_conv_ac_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "ConvEltwiseAddAffineChannelFusePass in op compat failed."; + return; + } + VLOG(4) << "handle ConvBN fuse"; GET_CONV_BN_NODES(conv_ac_pattern); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h index 916384ec44704537f472c8b99bc5766489bd1ced..8cfaf5c6a89f06b453dbbc94b5a7fe8b83e5c111 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h @@ -31,6 +31,7 @@ class Graph; class ConvAffineChannelFusePass : public FusePassBase { public: + ConvAffineChannelFusePass(); virtual ~ConvAffineChannelFusePass() {} protected: @@ -40,6 +41,7 @@ class ConvAffineChannelFusePass : public FusePassBase { class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { public: + ConvEltwiseAddAffineChannelFusePass(); virtual ~ConvEltwiseAddAffineChannelFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 9cc44c941eca19ddcc9f5ce42f913d711b1810fe..c362eec34b068347032cffd5feda7a3f49abb6d9 100644 --- 
a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -140,6 +140,100 @@ void recompute_bias_and_weights(const Scope* scope, } } +ConvBNFusePass::ConvBNFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); + + AddOpCompat(OpCompat("batch_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddInput("Mean") + .IsTensor() + .End() + .AddInput("Variance") + .IsTensor() + .End() + .AddOutput("MeanOut") + .IsTensor() + .End() + .AddOutput("VarianceOut") + .IsTensor() + .End() + .AddOutput("SavedMean") + .IsTensor() + .End() + .AddOutput("SavedVariance") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("ReserveSpace") + .IsTensor() + .IsOptional() + .End() + .AddAttr("epsilon") + .IsNumLE(0.001f) + .IsNumGE(0.0f) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); +} + void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -161,8 +255,11 @@ void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { int found_conv_bn_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } VLOG(4) << "handle " + conv_type() + "BN fuse"; - // conv, batch_norm, // conv_weight, conv_out, // bn_scale, bn_bias, bn_mean, bn_variance, @@ -236,6 +333,10 @@ void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { } conv->Op()->SetOutput("Output", std::vector({bn_out->Name()})); + if (!IsCompat(*conv->Op())) { + LOG(WARNING) << "conv_bn fuse pass in out conv op compat failed."; + return; + } GraphSafeRemoveNodes( graph, {conv_out, bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, @@ -251,6 +352,11 @@ void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetOutput("Out", std::vector({bn_out->Name()})); desc.SetType("elementwise_add"); desc.SetAttr("axis", 1); + if (!IsCompat(desc)) { + LOG(WARNING) + << "conv_bn fuse pass in out elementwise_add op compat failed."; + return; + } auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. 
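ConvBNFusePass relies on recompute_bias_and_weights to fold the batch_norm statistics into the convolution before rewiring the graph. The usual folding algebra, shown here as a sketch rather than a copy of Paddle's helper, scales each output channel by gamma / sqrt(var + eps) and shifts the bias accordingly:

#include <cmath>
#include <cstddef>
#include <vector>

// Standard conv+BN folding (sketch, not recompute_bias_and_weights itself):
//   scale_c = gamma_c / sqrt(var_c + eps)
//   W'_c    = W_c * scale_c
//   b'_c    = (b_c - mean_c) * scale_c + beta_c
void FoldBatchNormIntoConv(std::vector<float>* weights,  // [C_out * K]
                           std::vector<float>* bias,     // [C_out]
                           const std::vector<float>& gamma,
                           const std::vector<float>& beta,
                           const std::vector<float>& mean,
                           const std::vector<float>& var, float eps,
                           std::size_t kernel_elems_per_channel) {
  for (std::size_t c = 0; c < bias->size(); ++c) {
    const float scale = gamma[c] / std::sqrt(var[c] + eps);
    for (std::size_t k = 0; k < kernel_elems_per_channel; ++k) {
      (*weights)[c * kernel_elems_per_channel + k] *= scale;  // rescale filter
    }
    (*bias)[c] = ((*bias)[c] - mean[c]) * scale + beta[c];    // fold shift
  }
}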
GraphSafeRemoveNodes(graph, {bn_scale, bn_bias, bn_mean, bn_variance, @@ -269,6 +375,100 @@ void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_conv_bn_count); } +ConvEltwiseAddBNFusePass::ConvEltwiseAddBNFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .IsOptional() + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); + + AddOpCompat(OpCompat("batch_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddInput("Mean") + .IsTensor() + .End() + .AddInput("Variance") + .IsTensor() + .End() + .AddOutput("MeanOut") + .IsTensor() + .End() + .AddOutput("VarianceOut") + .IsTensor() + .End() + .AddOutput("SavedMean") + .IsTensor() + .End() + .AddOutput("SavedVariance") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("ReserveSpace") + .IsTensor() + .IsOptional() + .End() + .AddAttr("epsilon") + .IsNumLE(0.001f) + .IsNumGE(0.0f) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); +} + void ConvEltwiseAddBNFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -290,8 +490,11 @@ void ConvEltwiseAddBNFusePass::ApplyImpl(ir::Graph* graph) const { int found_conv_bn_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } VLOG(4) << "handle " + conv_type() + "BN fuse"; - // conv, batch_norm, // conv_weight, conv_out, // bn_scale, bn_bias, bn_mean, bn_variance, @@ -361,7 +564,11 @@ void ConvEltwiseAddBNFusePass::ApplyImpl(ir::Graph* graph) const { // Update the elementwise_add node eltwise->Op()->SetAttr("axis", 1); eltwise->Op()->SetOutput("Out", std::vector({bn_out->Name()})); - + if (!IsCompat(*eltwise->Op())) { + LOG(WARNING) + << "conv_eltwise_bn fuse pass in out eltwise op compat failed."; + return; + } GraphSafeRemoveNodes( graph, {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out, @@ -377,6 +584,132 @@ void ConvEltwiseAddBNFusePass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_conv_bn_count); } +ConvTransposeBNFusePass::ConvTransposeBNFusePass() { + AddOpCompat(OpCompat("conv2d_transpose")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("output_padding") + .IsType>() + .IsOptional() + .End() + .AddAttr("output_size") + .IsType>() + .IsOptional() + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + 
.IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); +} + +ConvTransposeEltwiseAddBNFusePass::ConvTransposeEltwiseAddBNFusePass() { + AddOpCompat(OpCompat("conv2d_transpose")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("output_padding") + .IsType>() + .IsOptional() + .End() + .AddAttr("output_size") + .IsType>() + .IsOptional() + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); +} + +DepthwiseConvBNFusePass::DepthwiseConvBNFusePass() { + AddOpCompat(OpCompat("depthwise_conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h index 342cd8dad5fb959a11df6c50fda4f22bb73ec5ba..b976aab0eeae20aa3599925dd5684744fca39a91 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h @@ -17,8 +17,6 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { @@ -27,11 +25,10 @@ namespace ir { /* * Fuse the Conv and BatchNorm to a ConvBNMKLDNNOp. 
*/ -class Graph; class ConvBNFusePass : public FusePassBase { public: - virtual ~ConvBNFusePass() {} + ConvBNFusePass(); virtual std::string conv_type() const { return "conv2d"; } protected: @@ -41,7 +38,7 @@ class ConvBNFusePass : public FusePassBase { class ConvEltwiseAddBNFusePass : public FusePassBase { public: - virtual ~ConvEltwiseAddBNFusePass() {} + ConvEltwiseAddBNFusePass(); virtual std::string conv_type() const { return "conv2d"; } protected: @@ -51,16 +48,19 @@ class ConvEltwiseAddBNFusePass : public FusePassBase { class ConvTransposeBNFusePass : public ConvBNFusePass { public: + ConvTransposeBNFusePass(); std::string conv_type() const { return "conv2d_transpose"; } }; class ConvTransposeEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass { public: + ConvTransposeEltwiseAddBNFusePass(); std::string conv_type() const { return "conv2d_transpose"; } }; class DepthwiseConvBNFusePass : public ConvBNFusePass { public: + DepthwiseConvBNFusePass(); std::string conv_type() const { return "depthwise_conv2d"; } }; diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index e7656171700b4ff7dda665b985521902518d7720..573436d393b85508d948c38b869b608cd58e5b05 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -52,6 +52,57 @@ framework::proto::OpDesc PrepareOpDesc( desc.Flush(); return *desc.Proto(); } +ConvElementwiseAdd2ActFusePass::ConvElementwiseAdd2ActFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .End() + .AddAttr("paddings") + .End() + .AddAttr("padding_algorithm") + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .End() + .AddAttr("data_format") + .IsStringIn({"NHWC", "NCHW"}) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + // the first elementwise_add-axis needs to be 1, the second has to be -1 + // or 0 + .IsIntIn({1, -1, 0}) + .End(); + + AddOpCompat(OpCompat("relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); +} void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "conv_elementwise_add2_act_fuse"; @@ -66,6 +117,10 @@ void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass op compat failed."; + return; + } GET_NODES; auto base_op_desc = *conv_op->Op()->Proto(); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h index e68f57d4ae998203c6f34aee7cca11d69a5e6d3f..3d5e5788fed2d002a63a0a6149b06be1f54e015a 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h @@ -24,6 +24,7 @@ class Graph; class ConvElementwiseAdd2ActFusePass : public FusePassBase { public: + ConvElementwiseAdd2ActFusePass(); virtual ~ConvElementwiseAdd2ActFusePass() {} protected: diff --git 
a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index ac6e22862d6299d193c9baa342c8ce5a6f2c56e6..c89984f384691760a4a9032778cac99c73eede13 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -48,6 +48,60 @@ framework::proto::OpDesc PrepareOpDesc( return *desc.Proto(); } +ConvElementwiseAddActFusePass::ConvElementwiseAddActFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("ResidualData") + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .End() + .AddAttr("paddings") + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); + + AddOpCompat(OpCompat("relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); +} + void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "conv_elementwise_add_act_fuse"; FusePassBase::Init(pattern_name, graph); @@ -63,6 +117,10 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } GET_NODES; auto base_op_desc = *conv_op->Op()->Proto(); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h index 933092c7db7d38d722af9392e71cd0c1797f0eee..d28f212f49e71be92ea9e9d0eff1683fb67c3566 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h @@ -24,6 +24,7 @@ class Graph; class ConvElementwiseAddActFusePass : public FusePassBase { public: + ConvElementwiseAddActFusePass(); virtual ~ConvElementwiseAddActFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc index 170b8fb8c80fa78884c3f4f69ebe892bc5b2908c..248a71ede14beb35db0580b879891d5b3b614157 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -29,6 +29,52 @@ namespace ir { GET_IR_NODE(elementwise_add_in_y); \ GET_IR_NODE(elementwise_add_out); +ConvElementwiseAddFusePass::ConvElementwiseAddFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("ResidualData") + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .End() + .AddAttr("paddings") + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); + + 
AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); +} + void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "conv_elementwise_add_fuse"; FusePassBase::Init(pattern_name, graph); @@ -44,6 +90,10 @@ void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } GET_NODES; auto base_op_desc = *conv_op->Op()->Proto(); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h index 7198a7488e052b5bdbe52d662b903d9f90c51da0..0913dc5c0022714e4013b718ab177862726dc911 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h @@ -24,6 +24,7 @@ class Graph; class ConvElementwiseAddFusePass : public FusePassBase { public: + ConvElementwiseAddFusePass(); virtual ~ConvElementwiseAddFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..09962239a01b1839bea93846ca3ffe9ded3cca4e --- /dev/null +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include + +#include "paddle/fluid/framework/ir/delete_dropout_op_pass.h" + +namespace paddle { +namespace framework { +class LoDTensor; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(any_op_out); \ + GET_IR_NODE(dropout_op); \ + GET_IR_NODE(dropout_op_out); \ + GET_IR_NODE(dropout_op_outmask); \ + GET_IR_NODE(any_op2); + +void DeleteDropoutOpPass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "delete_dropout_op_pattern"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + + patterns::DeleteDropoutOpPattern pattern(gpd.mutable_pattern(), pattern_name); + pattern(); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + IR_NODE_LINK_TO(any_op_out, any_op2); + std::string any_op_out_name = any_op_out->Var()->Name(); + std::string dropout_op_out_name = dropout_op_out->Var()->Name(); + + auto* any_op2_desc = any_op2->Op(); + auto var_map = any_op2_desc->Inputs(); + std::string arg_name = ""; + for (auto& name_m : var_map) { + if (std::find(name_m.second.begin(), name_m.second.end(), + dropout_op_out_name) != name_m.second.end()) { + arg_name = name_m.first; + } + } + if (arg_name.size() == 0) { + LOG(INFO) << "Delete dropout op pass: can not find the input " + << dropout_op_out_name; + return; + } + + // modify the any_op2's inputs + for (auto& name_m : var_map) { + if (std::find(name_m.second.begin(), name_m.second.end(), + dropout_op_out_name) != name_m.second.end()) { + std::vector new_inputs; + for (auto& i_n : name_m.second) { + if (i_n != dropout_op_out_name) { + new_inputs.push_back(i_n); + } + } + new_inputs.push_back(any_op_out_name); + any_op2_desc->SetInput(name_m.first, new_inputs); + any_op2_desc->Flush(); + } + } + any_op2_desc->Flush(); + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph, + {dropout_op, dropout_op_out, dropout_op_outmask}); + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(delete_dropout_op_pass, + paddle::framework::ir::DeleteDropoutOpPass); diff --git a/paddle/fluid/operators/reverse_op.cu b/paddle/fluid/framework/ir/delete_dropout_op_pass.h similarity index 54% rename from paddle/fluid/operators/reverse_op.cu rename to paddle/fluid/framework/ir/delete_dropout_op_pass.h index 635c41529b38f2dd287b00ed2e5659e11f619e78..c49abf3c871ced474bc47e28ec32d29bc9ccf750 100644 --- a/paddle/fluid/operators/reverse_op.cu +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.h @@ -12,13 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. 
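The DeleteDropoutOpPass handler above bypasses dropout at inference time: it links the dropout input var directly to the consumer, rewrites the consumer's input list so the dropout output name is replaced by the dropout input name, and then removes the dropout node together with its Out and Mask vars. A standalone sketch of that name rewrite over plain std containers (hypothetical ReplaceInputName, not the IR classes):

#include <algorithm>
#include <map>
#include <string>
#include <vector>

// Wherever the consumer op listed the dropout output, substitute the dropout
// input so the dropout node (and its mask output) can be dropped.
void ReplaceInputName(std::map<std::string, std::vector<std::string>>* inputs,
                      const std::string& dropout_out,
                      const std::string& dropout_in) {
  for (auto& slot : *inputs) {
    auto& names = slot.second;
    if (std::find(names.begin(), names.end(), dropout_out) != names.end()) {
      std::vector<std::string> rewritten;
      for (const auto& n : names) {
        if (n != dropout_out) rewritten.push_back(n);  // keep other inputs
      }
      rewritten.push_back(dropout_in);  // splice in dropout's own input
      names = rewritten;
    }
  }
}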
-#include "paddle/fluid/operators/reverse_op.h" +#pragma once +#include -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - reverse, ops::ReverseKernel, - ops::ReverseKernel, - ops::ReverseKernel, - ops::ReverseKernel, - ops::ReverseKernel, - ops::ReverseKernel) +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; + +class DeleteDropoutOpPass : public FusePassBase { + public: + virtual ~DeleteDropoutOpPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc index 4379bba6380c598431cce76717742dc96af3a142..4ce91999207a2b1a8ad2a3ab594aa74f9aece8e3 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc @@ -32,6 +32,37 @@ namespace ir { GET_IR_NODE(quant_dequant_op_outscale); \ GET_IR_NODE(any_op2); +DeleteQuantDequantFilterOpPass::DeleteQuantDequantFilterOpPass() { + AddOpCompat(OpCompat("fake_quantize_dequantize_abs_max")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("OutScale") + .IsTensor() + .End() + .AddAttr("bit_length") + .IsIntIn({8, 16}) + .End(); + AddOpCompat(OpCompat("fake_channel_wise_quantize_dequantize_abs_max")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("OutScale") + .IsTensor() + .End() + .AddAttr("bit_length") + .IsIntIn({8, 16}) + .End() + .AddAttr("quant_axis") + .IsIntIn({0, 1}) + .End(); +} // Delete quant_dequant_op, then quantize and dequantize weight void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "delete_quantdequant_filter_op_pattern"; @@ -50,6 +81,11 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { Graph* g) { GET_NODES; + if (!IsCompat(*quant_dequant_op->Op())) { + LOG(WARNING) << "quant_dequant_op in delete_quant_dequant_filter_op_pass " + "compat check failed."; + return; + } std::unordered_set nodes2rm = {}; int bit_length = BOOST_GET_CONST(int, quant_dequant_op->Op()->GetAttr("bit_length")); diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.h b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.h index 0409032d93816a2ba3121f2390aef5e59681ca9f..23049aac9622ee31609d8bf353f23a6f8ba3a6ff 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.h +++ b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.h @@ -16,16 +16,14 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { namespace ir { -class Graph; - class DeleteQuantDequantFilterOpPass : public FusePassBase { public: + DeleteQuantDequantFilterOpPass(); virtual ~DeleteQuantDequantFilterOpPass() {} protected: diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 48f79e63b4f0ea51df27695943690c1c36727e93..0f6421134c21655b9ffb4313d3459541d59a659e 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ 
b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -136,8 +136,12 @@ void SkipLayerNorm::operator()() { ->LinksFrom({eltwise_add_out, layer_norm_bias_var, layer_norm_scale_var}) .LinksTo({layer_norm_out, layer_norm_mean_var, layer_norm_variance_var}); } -static int BuildFusion(Graph* graph, const std::string& name_scope - /*const Scope* scope*/) { + +} // namespace patterns + +int EmbeddingEltwiseLayerNormFusePass::BuildFusion( + Graph* graph, const std::string& name_scope + /*const Scope* scope*/) const { GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); @@ -146,7 +150,8 @@ static int BuildFusion(Graph* graph, const std::string& name_scope std::vector> start_pattern_remove_nodes; // Create pattern. - Embedding2Eltwise1Pattern start_pattern(pattern, name_scope + "/start"); + patterns::Embedding2Eltwise1Pattern start_pattern(pattern, + name_scope + "/start"); start_pattern(); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -162,6 +167,10 @@ static int BuildFusion(Graph* graph, const std::string& name_scope start_pattern); GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, start_pattern); GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, start_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "Pass(Embedding2Eltwise1Pattern) in op compat failed."; + return; + } std::vector> ins; ins.push_back(std::make_pair(lookup_table1_x, lookup_table1_w)); ins.push_back(std::make_pair(lookup_table2_x, lookup_table2_w)); @@ -182,7 +191,8 @@ static int BuildFusion(Graph* graph, const std::string& name_scope GraphPatternDetector gpd2; auto* pattern2 = gpd2.mutable_pattern(); - Embedding1Eltwise1Pattern second_pattern(pattern2, name_scope + "/second"); + patterns::Embedding1Eltwise1Pattern second_pattern(pattern2, + name_scope + "/second"); second_pattern(); auto handler2 = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -194,6 +204,10 @@ static int BuildFusion(Graph* graph, const std::string& name_scope GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_in, eltwise_add_in, second_pattern); GET_IR_NODE_FROM_SUBGRAPH(eltwise_add, eltwise_add, second_pattern); GET_IR_NODE_FROM_SUBGRAPH(eltwise_add_out, eltwise_add_out, second_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "Pass(Embedding1Eltwise1Pattern) in op compat failed."; + return; + } auto in = std::make_pair(lookup_table1_x, lookup_table1_w); inner_pattern_ins.push_back(in); inner_pattern_tmp_in.push_back(eltwise_add_in); @@ -214,7 +228,8 @@ static int BuildFusion(Graph* graph, const std::string& name_scope std::vector> end_pattern_remove_nodes; GraphPatternDetector gpd3; auto* pattern3 = gpd3.mutable_pattern(); - SkipLayerNorm skip_layernorm_pattern(pattern3, name_scope + "/third"); + patterns::SkipLayerNorm skip_layernorm_pattern(pattern3, + name_scope + "/third"); skip_layernorm_pattern(); auto handler3 = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -232,6 +247,10 @@ static int BuildFusion(Graph* graph, const std::string& name_scope skip_layernorm_pattern); GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, skip_layernorm_pattern); + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "Pass(SkipLayerNorm) in op compat failed."; + return; + } end_pattern_elt_out.push_back(eltwise_add_out); std::unordered_set rm_nodes; rm_nodes.insert({layer_norm, layer_norm_mean, layer_norm_variance}); @@ -349,11 +368,53 @@ static int BuildFusion(Graph* graph, const std::string& name_scope return 
fusion_count; } -} // namespace patterns +EmbeddingEltwiseLayerNormFusePass::EmbeddingEltwiseLayerNormFusePass() { + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({0, -1}) + .End(); + + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); +} void EmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const { FusePassBase::Init(name_scope_, graph); - int fusion_count = patterns::BuildFusion(graph, name_scope_); + int fusion_count = + EmbeddingEltwiseLayerNormFusePass::BuildFusion(graph, name_scope_); if (fusion_count > 0) { graph->Set(kEmbEltwiseLayernormPass, new bool(true)); } diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h index 25049d7468b152e72ad5f32fb38d9204f7219dff..fac9b49e886cb3ed55992cffe2c90c8fa5607dba 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.h @@ -19,8 +19,6 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { @@ -150,11 +148,13 @@ struct SkipLayerNorm : public PatternBase { class EmbeddingEltwiseLayerNormFusePass : public FusePassBase { public: + EmbeddingEltwiseLayerNormFusePass(); virtual ~EmbeddingEltwiseLayerNormFusePass() {} protected: void ApplyImpl(Graph* graph) const; - + int BuildFusion(Graph* graph, const std::string& name_scope + /*const Scope* scope*/) const; const std::string name_scope_{"embedding_eltwise_layernorm_fuse"}; }; diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc index ef5b3c3c96e2374ef0cabc1ed8fc4bbab9577388..d3cf3319adfc5eaed5ce285bef86b81991d7350a 100644 --- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc @@ -136,6 +136,70 @@ static bool IsEqual(const std::vector &x, const std::vector &y) { return true; } +FCElementwiseLayerNormFusePass::FCElementwiseLayerNormFusePass() { + AddOpCompat(OpCompat("fc")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("in_num_col_dims") + .IsNumGE(1) + .End() + .AddAttr("activation_type") + .IsStringIn({"relu", ""}) + .End(); + + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsOptional() + .End() + .AddOutput("Variance") + .IsOptional() + .End() + + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + 
.AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 0}) + .End(); +} + void FCElementwiseLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { PADDLE_ENFORCE_NOT_NULL(graph, platform::errors::InvalidArgument( @@ -159,6 +223,11 @@ void FCElementwiseLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { return; } + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } + VLOG(4) << "handle FCElementwiseLayerNorm fuse"; GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fused_pattern); GET_IR_NODE_FROM_SUBGRAPH(fc_w, fc_w, fused_pattern); diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h index 12e4c44b84e87bb710774ebba0ba2853d8b37f5e..0e8f9866c765c2fb9d8c0199a2a02fccee2c6c12 100644 --- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h @@ -24,6 +24,7 @@ class Graph; class FCElementwiseLayerNormFusePass : public FusePassBase { public: + FCElementwiseLayerNormFusePass(); virtual ~FCElementwiseLayerNormFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index bc1be79d1b1688690965bf772c011d774ae1da78..0bb2782b3737ee3130e2d7bee68fd932c3b87932 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/framework/ir/fc_fuse_pass.h" - #include #include "paddle/fluid/framework/op_version_registry.h" @@ -23,6 +22,67 @@ namespace paddle { namespace framework { namespace ir { +FCFusePass::FCFusePass() { + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumGE(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumGE(1) + .End(); + + AddOpCompat(OpCompat("relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("fc")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("in_num_col_dims") + .IsNumGE(1) + .End() + .AddAttr("activation_type") + .IsStringIn({"relu", ""}) + .End(); +} + void FCFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -52,6 +112,10 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { LOG(WARNING) << "The subgraph is empty."; return; } + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } VLOG(4) << "handle FC fuse"; GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); @@ -159,6 +223,11 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const { } desc.Flush(); + if (!IsCompat(desc)) { + LOG(WARNING) << "Fc fuse pass in out fc op compat failed."; + return; + } + auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. 
if (with_relu) { GraphSafeRemoveNodes( diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h index f564bbb151854fe325975285b18d25b517336014..21ef17b65dc2cb8b630155693024b706864f64d5 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -30,6 +30,7 @@ class Graph; class FCFusePass : public FusePassBase { public: + FCFusePass(); virtual ~FCFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index cf35c1ac772da079159cb4ced2edc234d7325b1e..5046911036818c902844a35220101836b6404478 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -58,12 +58,12 @@ TEST(FCFusePass, basic) { auto* weights_0 = layers.data("weights_0", {}, true); auto* mul_out_0 = layers.mul(relu_out_0, weights_0); auto* bias_1 = layers.data("bias_1", {}, true); - auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1); + auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1, nullptr, 1); auto* relu_out_1 = layers.relu(add_out_0); auto* weights_1 = layers.data("weights_1", {}, true); auto* mul_out_1 = layers.mul(relu_out_1, weights_1); auto* bias_2 = layers.data("bias_2", {}, true); - auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2); + auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2, nullptr, 1); VLOG(4) << add_out_1; std::unique_ptr graph(new ir::Graph(layers.main_program())); diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index b1c62d40d4d7c7ea00528a35fde7eba5d80185f6..e1260f62ddb6499abf1794af386045bf0565c4b3 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -30,8 +30,137 @@ namespace ir { class Node; -static int BuildFusion(Graph* graph, const std::string& name_scope, - Scope* scope, bool with_fc_bias) { +MulGRUFusePass::MulGRUFusePass() { + AddOpCompat(OpCompat("gru")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("H0") + .IsTensor() + .IsOptional() + .End() + .AddInput("Weight") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("BatchGate") + .IsTensor() + .End() + .AddOutput("BatchResetHiddenPrev") + .IsTensor() + .End() + .AddOutput("BatchHidden") + .IsTensor() + .End() + .AddOutput("Hidden") + .IsTensor() + .End() + .AddAttr("activation") + .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .End() + .AddAttr("gate_activation") + .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .End() + .AddAttr("is_reverse") + .IsType() + .End() + .AddAttr("origin_mode") + .IsType() + .IsOptional() + .End(); + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumEQ(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); +} + +FCGRUFusePass::FCGRUFusePass() { + AddOpCompat(OpCompat("gru")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("H0") + .IsTensor() + .IsOptional() + .End() + .AddInput("Weight") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("BatchGate") + .IsTensor() + .End() + .AddOutput("BatchResetHiddenPrev") + .IsTensor() + .End() + .AddOutput("BatchHidden") + .IsTensor() + .End() + .AddOutput("Hidden") + .IsTensor() + .End() + .AddAttr("activation") + .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) 
+ .End() + .AddAttr("gate_activation") + .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .End() + .AddAttr("is_reverse") + .IsType() + .End() + .AddAttr("origin_mode") + .IsType() + .IsOptional() + .End(); + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumEQ(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumGE(-1) + .End(); +} + +int FCGRUFusePass::BuildFusion(Graph* graph, const std::string& name_scope, + Scope* scope, bool with_fc_bias) const { GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); @@ -47,8 +176,9 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, gru_pattern(fc_out); // Create New OpDesc - auto gru_creater = [&](Node* gru, Node* x, Node* weight_x, Node* weight_h, - Node* bias, Node* hidden, Node* fc_bias) { + auto gru_creator = [&](Node* gru, Node* x, Node* weight_x, Node* weight_h, + Node* bias, Node* hidden, Node* fc_bias, + const bool use_mkldnn) { OpDesc op_desc; op_desc.SetType("fusion_gru"); @@ -67,6 +197,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, gru->Op()->GetAttrIfExists("origin_mode")); // TODO(TJ): This should be a option for infer op_desc.SetAttr("use_seq", true); + op_desc.SetAttr("use_mkldnn", use_mkldnn); op_desc.SetAttr("activation", gru->Op()->GetAttr("activation")); op_desc.SetAttr("gate_activation", gru->Op()->GetAttr("gate_activation")); @@ -131,6 +262,10 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, int fusion_count{0}; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } auto* x_n = subgraph.at(x); GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); @@ -149,6 +284,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, LOG(INFO) << "fc_gru_fuse_pass not supported when origin_mode=True."; return; } + const bool use_mkldnn = + (mul->Op()->GetAttrIfExists("use_mkldnn") && + gru->Op()->GetAttrIfExists("activation") == "tanh" && + gru->Op()->GetAttrIfExists("gate_activation") == + "sigmoid"); if (with_fc_bias) { GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); @@ -156,14 +296,14 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern); - gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias); + gru_creator(gru, x_n, w, Weight, Bias, Hidden, fc_bias, use_mkldnn); // Remove unneeded nodes. std::unordered_set marked_nodes( {mul, gru, elementwise_add, fc_out, mul_out, BatchGate, BatchResetHiddenPrev, BatchHidden}); GraphSafeRemoveNodes(graph, marked_nodes); } else { - gru_creater(gru, x_n, w, Weight, Bias, Hidden, nullptr); + gru_creator(gru, x_n, w, Weight, Bias, Hidden, nullptr, use_mkldnn); // Remove unneeded nodes. 
std::unordered_set marked_nodes( {mul, gru, BatchGate, BatchResetHiddenPrev, BatchHidden}); @@ -182,8 +322,8 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, void MulGRUFusePass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init(name_scope_, graph); - int fusion_count = - BuildFusion(graph, name_scope_, param_scope(), false /*with_fc_bias*/); + int fusion_count = MulGRUFusePass::BuildFusion( + graph, name_scope_, param_scope(), false /*with_fc_bias*/); AddStatis(fusion_count); } @@ -191,8 +331,8 @@ void MulGRUFusePass::ApplyImpl(ir::Graph* graph) const { void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init(name_scope_, graph); - int fusion_count = - BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/); + int fusion_count = FCGRUFusePass::BuildFusion( + graph, name_scope_, param_scope(), true /*with_fc_bias*/); AddStatis(fusion_count); } diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h index 73f00504d34d5f1cfddbc3826f7a84e6925fc9f3..421f3ef46d7f5c974b513c477e8c4d25a097815d 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h @@ -18,7 +18,6 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { @@ -26,21 +25,22 @@ namespace ir { // The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op. -class Graph; - class FCGRUFusePass : public FusePassBase { public: + FCGRUFusePass(); virtual ~FCGRUFusePass() {} protected: void ApplyImpl(ir::Graph* graph) const override; - const std::string name_scope_{"fc_gru_fuse"}; + int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, + bool with_fc_bias) const; }; // Just FC without bias -class MulGRUFusePass : public FusePassBase { +class MulGRUFusePass : public FCGRUFusePass { public: + MulGRUFusePass(); virtual ~MulGRUFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.cc index 70351b8aafffa1a42c4ac4c3cd281f230ef956c8..6ec47fae26a932b26147b9811dd9d9a54cc1cccc 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.cc @@ -12,77 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
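Two things change in the GRU fusion above: BuildFusion becomes a const member of FCGRUFusePass (so the handler can call IsCompat, and MulGRUFusePass now derives from FCGRUFusePass and simply calls it with with_fc_bias=false), and the fused fusion_gru op only inherits use_mkldnn when the matched mul already requested oneDNN and the GRU uses the tanh/sigmoid activation pair; any other combination forces the attribute to false. The gating condition reduces to a small predicate; the standalone helper below is a sketch of that condition for clarity, not code from the patch.

// Sketch: mirrors the use_mkldnn gate added in fc_gru_fuse_pass.cc.
#include <cassert>
#include <string>

bool ForwardUseMkldnnToFusionGRU(bool mul_use_mkldnn,
                                 const std::string& activation,
                                 const std::string& gate_activation) {
  // The pass only forwards use_mkldnn for the tanh/sigmoid pair carried by
  // the matched gru op; everything else falls back to use_mkldnn = false.
  return mul_use_mkldnn && activation == "tanh" &&
         gate_activation == "sigmoid";
}

int main() {
  assert(ForwardUseMkldnnToFusionGRU(true, "tanh", "sigmoid"));
  assert(!ForwardUseMkldnnToFusionGRU(true, "relu", "sigmoid"));
  assert(!ForwardUseMkldnnToFusionGRU(false, "tanh", "sigmoid"));
  return 0;
}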
-#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" - -#include -#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h" namespace paddle { namespace framework { namespace ir { -void AddVarToScope(Scope* param_scope, const std::string& name, - const DDim& dims) { - auto* tensor = param_scope->Var(name)->GetMutable(); - tensor->Resize(dims); - tensor->mutable_data(platform::CPUPlace()); -} - -Scope* CreateParamScope() { - auto param_scope = new Scope(); - AddVarToScope(param_scope, "gru_fc_w", {}); - AddVarToScope(param_scope, "gru_fc_b", {}); - AddVarToScope(param_scope, "gru_w", {}); - AddVarToScope(param_scope, "gru_b", {}); - AddVarToScope(param_scope, "gru_batch_gate_0", {}); - AddVarToScope(param_scope, "gru_batch_reset_hidden_prev_0", {}); - AddVarToScope(param_scope, "gru_batch_hidden_0", {}); - AddVarToScope(param_scope, "gru_hidden_0", {}); - AddVarToScope(param_scope, "gru_batch_gate_1", {}); - AddVarToScope(param_scope, "gru_batch_reset_hidden_prev_1", {}); - AddVarToScope(param_scope, "gru_batch_hidden_1", {}); - AddVarToScope(param_scope, "gru_hidden_1", {}); - return param_scope; -} - -TEST(FCFusePass, basic) { - // inputs operator output - // -------------------------------------------------------- - // (a, gru_fc_w) mul -> fc_0_tmp_0 - // (fc_0_tmp_0, gru_fc_b) elementwise_add -> fc_0_tmp_1 - // (fc_0_tmp_1,gru_w,gru_b gru -> gru_out_0 - - // (b, gru_fc_w) mul -> fc_1_tmp_0 - // (fc_1_tmp_0, gru_fc_b) elementwise_add -> fc_1_tmp_1 - // (fc_1_tmp_1,gru_w,gru_b) gru -> gru_out_1 - Layers layers; - auto* a = layers.data("a"); - auto* b = layers.data("b"); - auto* fc_w = layers.data("gru_fc_w", {}, true); - auto* fc_b = layers.data("gru_fc_b", {}, true); - auto* gru_w = layers.data("gru_w", {}, true); - auto* gru_b = layers.data("gru_b", {}, true); - auto* fc_0_tmp0 = layers.mul(a, fc_w); - auto* fc_0_tmp1 = layers.elementwise_add(fc_0_tmp0, fc_b); - auto* gru_batch_gate_0 = layers.data("gru_batch_gate_0", {}, false); - auto* gru_batch_reset_hidden_prev_0 = - layers.data("gru_batch_reset_hidden_prev_0", {}, false); - auto* gru_batch_hidden_0 = layers.data("gru_batch_hidden_0", {}, false); - auto* gru_hidden_0 = layers.data("gru_hidden_0", {}, false); - layers.gru(fc_0_tmp1, gru_w, gru_b, gru_batch_gate_0, - gru_batch_reset_hidden_prev_0, gru_batch_hidden_0, gru_hidden_0); - - auto* fc_1_tmp0 = layers.mul(b, fc_w); - auto* fc_1_tmp1 = layers.elementwise_add(fc_1_tmp0, fc_b); - auto* gru_batch_gate_1 = layers.data("gru_batch_gate_1", {}, false); - auto* gru_batch_reset_hidden_prev_1 = - layers.data("gru_batch_reset_hidden_prev_1", {}, false); - auto* gru_batch_hidden_1 = layers.data("gru_batch_hidden_1", {}, false); - auto* gru_hidden_1 = layers.data("gru_hidden_1", {}, false); - layers.gru(fc_1_tmp1, gru_w, gru_b, gru_batch_gate_1, - gru_batch_reset_hidden_prev_1, gru_batch_hidden_1, gru_hidden_1); - - std::unique_ptr graph(new ir::Graph(layers.main_program())); +namespace fc_gru_test { +TEST(FcGruFusePass, basic) { + std::unique_ptr graph = PrepareGraph(); auto pass = PassRegistry::Instance().Get("fc_gru_fuse_pass"); pass->Set("use_gpu", new bool(true)); graph->Set("__param_scope__", CreateParamScope()); @@ -109,6 +47,7 @@ TEST(FCFusePass, basic) { "expectations after fuse")); } +} // namespace fc_gru_test } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h new file mode 100644 
index 0000000000000000000000000000000000000000..a862755d604e44754f0905bb5f4c53d91daeadaf --- /dev/null +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h @@ -0,0 +1,96 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" + +#include +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +namespace fc_gru_test { +void AddVarToScope(Scope* param_scope, const std::string& name, + const DDim& dims) { + auto* tensor = param_scope->Var(name)->GetMutable(); + tensor->Resize(dims); + tensor->mutable_data(platform::CPUPlace()); +} + +Scope* CreateParamScope() { + auto param_scope = new Scope(); + AddVarToScope(param_scope, "gru_fc_w", {}); + AddVarToScope(param_scope, "gru_fc_b", {}); + AddVarToScope(param_scope, "gru_w", {}); + AddVarToScope(param_scope, "gru_b", {}); + AddVarToScope(param_scope, "gru_batch_gate_0", {}); + AddVarToScope(param_scope, "gru_batch_reset_hidden_prev_0", {}); + AddVarToScope(param_scope, "gru_batch_hidden_0", {}); + AddVarToScope(param_scope, "gru_hidden_0", {}); + AddVarToScope(param_scope, "gru_batch_gate_1", {}); + AddVarToScope(param_scope, "gru_batch_reset_hidden_prev_1", {}); + AddVarToScope(param_scope, "gru_batch_hidden_1", {}); + AddVarToScope(param_scope, "gru_hidden_1", {}); + return param_scope; +} + +std::unique_ptr PrepareGraph( + std::string activation = "tanh", std::string gate_activation = "sigmoid") { + // inputs operator output + // -------------------------------------------------------- + // (a, gru_fc_w) mul -> fc_0_tmp_0 + // (fc_0_tmp_0, gru_fc_b) elementwise_add -> fc_0_tmp_1 + // (fc_0_tmp_1,gru_w,gru_b gru -> gru_out_0 + + // (b, gru_fc_w) mul -> fc_1_tmp_0 + // (fc_1_tmp_0, gru_fc_b) elementwise_add -> fc_1_tmp_1 + // (fc_1_tmp_1,gru_w,gru_b) gru -> gru_out_1 + Layers layers; + auto* a = layers.data("a"); + auto* b = layers.data("b"); + auto* fc_w = layers.data("gru_fc_w", {}, true); + auto* fc_b = layers.data("gru_fc_b", {}, true); + auto* gru_w = layers.data("gru_w", {}, true); + auto* gru_b = layers.data("gru_b", {}, true); + auto* fc_0_tmp0 = layers.mul(a, fc_w); + auto* fc_0_tmp1 = layers.elementwise_add(fc_0_tmp0, fc_b); + auto* gru_batch_gate_0 = layers.data("gru_batch_gate_0", {}, false); + auto* gru_batch_reset_hidden_prev_0 = + layers.data("gru_batch_reset_hidden_prev_0", {}, false); + auto* gru_batch_hidden_0 = layers.data("gru_batch_hidden_0", {}, false); + auto* gru_hidden_0 = layers.data("gru_hidden_0", {}, false); + layers.gru(fc_0_tmp1, gru_w, gru_b, gru_batch_gate_0, + gru_batch_reset_hidden_prev_0, gru_batch_hidden_0, gru_hidden_0, + nullptr, false, false, activation, gate_activation); + + auto* fc_1_tmp0 = layers.mul(b, fc_w); + auto* fc_1_tmp1 = layers.elementwise_add(fc_1_tmp0, fc_b); + auto* gru_batch_gate_1 = layers.data("gru_batch_gate_1", {}, false); + auto* gru_batch_reset_hidden_prev_1 = + 
layers.data("gru_batch_reset_hidden_prev_1", {}, false); + auto* gru_batch_hidden_1 = layers.data("gru_batch_hidden_1", {}, false); + auto* gru_hidden_1 = layers.data("gru_hidden_1", {}, false); + layers.gru(fc_1_tmp1, gru_w, gru_b, gru_batch_gate_1, + gru_batch_reset_hidden_prev_1, gru_batch_hidden_1, gru_hidden_1, + nullptr, false, false, activation, gate_activation); + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + return std::move(graph); +} +} // namespace fc_gru_test +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index 1c1289124506ab4e3b1baf74211bea370c144380..35704f1f3309e1a91b18d7a2c30ee7dda3b57e51 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -29,8 +29,149 @@ namespace ir { class Node; -int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, - bool with_fc_bias) { +MulLstmFusePass::MulLstmFusePass() { + AddOpCompat(OpCompat("lstm")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("H0") + .IsTensor() + .IsOptional() + .End() + .AddInput("C0") + .IsTensor() + .IsOptional() + .End() + .AddInput("Weight") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Hidden") + .IsTensor() + .End() + .AddOutput("Cell") + .IsTensor() + .End() + .AddOutput("BatchGate") + .IsTensor() + .End() + .AddOutput("BatchCellPreAct") + .IsTensor() + .End() + .AddAttr("use_peepholes") + .IsType() + .End() + .AddAttr("is_reverse") + .IsType() + .End() + .AddAttr("gate_activation") + .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .End() + .AddAttr("cell_activation") + .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .End() + .AddAttr("candidate_activation") + .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .End(); + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumEQ(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); +} + +FCLstmFusePass::FCLstmFusePass() { + AddOpCompat(OpCompat("lstm")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("H0") + .IsTensor() + .IsOptional() + .End() + .AddInput("C0") + .IsTensor() + .IsOptional() + .End() + .AddInput("Weight") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Hidden") + .IsTensor() + .End() + .AddOutput("Cell") + .IsTensor() + .End() + .AddOutput("BatchGate") + .IsTensor() + .End() + .AddOutput("BatchCellPreAct") + .IsTensor() + .End() + .AddAttr("use_peepholes") + .IsType() + .End() + .AddAttr("is_reverse") + .IsType() + .End() + .AddAttr("gate_activation") + .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .End() + .AddAttr("cell_activation") + .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .End() + .AddAttr("candidate_activation") + .IsStringIn({"sigmoid", "tanh", "relu", "identity"}) + .End(); + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumEQ(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumGE(-1) + .End(); +} + +int 
FCLstmFusePass::BuildFusion(Graph* graph, const std::string& name_scope, + Scope* scope, bool with_fc_bias) const { GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); @@ -47,7 +188,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, // Create New OpDesc auto lstm_creator = [&](Node* lstm, Node* input, Node* weight_x, Node* weight_h, Node* bias, Node* hidden, Node* cell, - Node* xx, Node* fc_bias) { + Node* xx, Node* fc_bias, const bool use_mkldnn) { OpDesc op_desc; op_desc.SetType("fusion_lstm"); #define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()}); @@ -88,6 +229,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, op_desc.SetOutput("XX", {xx->Name()}); op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse")); op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes")); + op_desc.SetAttr("use_mkldnn", use_mkldnn); // TODO(TJ): get from attr op_desc.SetAttr("use_seq", true); @@ -139,6 +281,10 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern); GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern); GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern); @@ -148,13 +294,22 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern); GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); + const bool use_mkldnn = + (mul->Op()->GetAttrIfExists("use_mkldnn") && + lstm->Op()->GetAttrIfExists("gate_activation") == + "sigmoid" && + lstm->Op()->GetAttrIfExists("cell_activation") == + "tanh" && + lstm->Op()->GetAttrIfExists("candidate_activation") == + "tanh"); + if (with_fc_bias) { GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out, - fc_bias); + fc_bias, use_mkldnn); // Remove unneeded nodes. std::unordered_set marked_nodes( {mul, lstm, elementwise_add, mul_out, BatchGate, BatchCellPreAct}); @@ -162,7 +317,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, } else { GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern); lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out, - nullptr); + nullptr, use_mkldnn); // Remove unneeded nodes. 
std::unordered_set marked_nodes( {mul, lstm, BatchGate, BatchCellPreAct}); diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h index d37f53b15f06b72e67c234baec3a314f0f462735..60b4953c2ec0a8c225d74a604d74433f344b2424 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h @@ -31,16 +31,19 @@ class Graph; class FCLstmFusePass : public FusePassBase { public: + FCLstmFusePass(); virtual ~FCLstmFusePass() {} protected: void ApplyImpl(ir::Graph* graph) const override; - + int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, + bool with_fc_bias) const; const std::string name_scope_{"fc_lstm_fuse"}; }; -class MulLstmFusePass : public FusePassBase { +class MulLstmFusePass : public FCLstmFusePass { public: + MulLstmFusePass(); virtual ~MulLstmFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.cc index 0de8d4684fecd45fd05e579b82b1f7ada11592dd..92de86e52bc0a55fd7258f6b65002d875f69049b 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.cc @@ -12,77 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" - -#include -#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h" namespace paddle { namespace framework { namespace ir { -void AddVarToScope(Scope* param_scope, const std::string& name, - const DDim& dims) { - auto* tensor = param_scope->Var(name)->GetMutable(); - tensor->Resize(dims); - tensor->mutable_data(platform::CPUPlace()); -} - -Scope* CreateParamScope() { - auto param_scope = new Scope(); - AddVarToScope(param_scope, "lstm_fc_w", {}); - AddVarToScope(param_scope, "lstm_fc_b", {}); - AddVarToScope(param_scope, "lstm_w", {}); - AddVarToScope(param_scope, "lstm_b", {}); - AddVarToScope(param_scope, "lstm_cell_0", {}); - AddVarToScope(param_scope, "lstm_batch_gate_0", {}); - AddVarToScope(param_scope, "lstm_batch_cell_pre_gate_0", {}); - AddVarToScope(param_scope, "lstm_hidden_0", {}); - AddVarToScope(param_scope, "lstm_cell_1", {}); - AddVarToScope(param_scope, "lstm_batch_gate_1", {}); - AddVarToScope(param_scope, "lstm_batch_cell_pre_gate_1", {}); - AddVarToScope(param_scope, "lstm_hidden_1", {}); - return param_scope; -} - -TEST(FCLSTMFusePass, basic) { - // inputs operator output - // -------------------------------------------------------- - // (a, lstm_fc_w) mul -> fc_0_tmp_0 - // (fc_0_tmp_0, lstm_fc_b) elementwise_add -> fc_0_tmp_1 - // fc_0_tmp_1,lstm_w,lstm_b lstm -> lstm_out_0 - - // (b, lstm_fc_w) mul -> fc_1_tmp_0 - // (fc_1_tmp_0, lstm_fc_b) elementwise_add -> fc_1_tmp_1 - // (fc_1_tmp_1,lstm_w,lstm_b) lstm -> lstm_out_1 - Layers layers; - auto* a = layers.data("a"); - auto* b = layers.data("b"); - auto* fc_w = layers.data("lstm_fc_w", {}, true); - auto* fc_b = layers.data("lstm_fc_b", {}, true); - auto* lstm_w = layers.data("lstm_w", {}, true); - auto* lstm_b = layers.data("lstm_b", {}, true); - auto* fc_0_tmp0 = layers.mul(a, fc_w); - auto* fc_0_tmp1 = layers.elementwise_add(fc_0_tmp0, fc_b); - auto* lstm_cell_0 = layers.data("lstm_cell_0", {}, false); - auto* lstm_batch_gate_0 = layers.data("lstm_batch_gate_0", {}, false); - auto* lstm_batch_cell_pre_gate_0 = - layers.data("lstm_batch_cell_pre_gate_0", 
{}, false); - auto* lstm_hidden_0 = layers.data("lstm_hidden_0", {}, false); - layers.lstm(fc_0_tmp1, lstm_w, lstm_b, lstm_cell_0, lstm_batch_gate_0, - lstm_hidden_0, lstm_batch_cell_pre_gate_0); +namespace fc_lstm_test { - auto* fc_1_tmp0 = layers.mul(b, fc_w); - auto* fc_1_tmp1 = layers.elementwise_add(fc_1_tmp0, fc_b); - auto* lstm_cell_1 = layers.data("lstm_cell_1", {}, false); - auto* lstm_batch_gate_1 = layers.data("lstm_batch_gate_1", {}, false); - auto* lstm_batch_cell_pre_gate_1 = - layers.data("lstm_batch_cell_pre_gate_1", {}, false); - auto* lstm_hidden_1 = layers.data("lstm_hidden_1", {}, false); - layers.lstm(fc_1_tmp1, lstm_w, lstm_b, lstm_cell_1, lstm_batch_gate_1, - lstm_hidden_1, lstm_batch_cell_pre_gate_1); - - std::unique_ptr graph(new ir::Graph(layers.main_program())); +TEST(FcLstmFusePass, basic) { + std::unique_ptr graph = PrepareGraph(); auto pass = PassRegistry::Instance().Get("fc_lstm_fuse_pass"); pass->Set("use_gpu", new bool(false)); graph->Set("__param_scope__", CreateParamScope()); @@ -108,7 +47,7 @@ TEST(FCLSTMFusePass, basic) { "The number of fusion_gru nodes does " "not meet expectations after fuse")); } - +} // namespace fc_lstm_test } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h new file mode 100644 index 0000000000000000000000000000000000000000..f681a2b7ff8eb02bf7a546daa2edefbdfcdc9539 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h @@ -0,0 +1,100 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
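The fc_lstm tester is refactored the same way as the fc_gru one: graph construction moves into the shared fc_lstm_fuse_pass_tester.h defined just below, where PrepareGraph() takes the gate, cell and candidate activation names and CreateParamScope() supplies the parameters, all under the fc_lstm_test namespace. A hypothetical follow-up test could reuse those helpers to check that non-default activations never request the oneDNN kernel; the sketch below assumes the helpers from that header and the in-tree gtest setup, and it is not part of the patch.

// Sketch of an additional test built on the new helpers; a "relu" gate
// activation is outside the sigmoid/tanh combination gated in the pass.
#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h"

namespace paddle {
namespace framework {
namespace ir {
namespace fc_lstm_test {

TEST(FcLstmFusePass, nonDefaultActivations) {
  std::unique_ptr<ir::Graph> graph = PrepareGraph("relu", "tanh", "tanh");
  auto pass = PassRegistry::Instance().Get("fc_lstm_fuse_pass");
  pass->Set("use_gpu", new bool(false));
  graph->Set("__param_scope__", CreateParamScope());
  graph.reset(pass->Apply(graph.release()));

  // Walk the rewritten graph: no fused op should ask for oneDNN here.
  for (const auto* node : graph->Nodes()) {
    if (node->IsOp() && node->Op() && node->Op()->Type() == "fusion_lstm") {
      EXPECT_FALSE(node->Op()->GetAttrIfExists<bool>("use_mkldnn"));
    }
  }
}

}  // namespace fc_lstm_test
}  // namespace ir
}  // namespace framework
}  // namespace paddle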
+ +#pragma once + +#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" + +#include +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +namespace fc_lstm_test { + +void AddVarToScope(Scope* param_scope, const std::string& name, + const DDim& dims) { + auto* tensor = param_scope->Var(name)->GetMutable(); + tensor->Resize(dims); + tensor->mutable_data(platform::CPUPlace()); +} + +Scope* CreateParamScope() { + auto param_scope = new Scope(); + AddVarToScope(param_scope, "lstm_fc_w", {}); + AddVarToScope(param_scope, "lstm_fc_b", {}); + AddVarToScope(param_scope, "lstm_w", {}); + AddVarToScope(param_scope, "lstm_b", {}); + AddVarToScope(param_scope, "lstm_cell_0", {}); + AddVarToScope(param_scope, "lstm_batch_gate_0", {}); + AddVarToScope(param_scope, "lstm_batch_cell_pre_gate_0", {}); + AddVarToScope(param_scope, "lstm_hidden_0", {}); + AddVarToScope(param_scope, "lstm_cell_1", {}); + AddVarToScope(param_scope, "lstm_batch_gate_1", {}); + AddVarToScope(param_scope, "lstm_batch_cell_pre_gate_1", {}); + AddVarToScope(param_scope, "lstm_hidden_1", {}); + return param_scope; +} + +std::unique_ptr PrepareGraph( + std::string gate_activation = "sigmoid", + std::string cell_activation = "tanh", + std::string candidate_activation = "tanh") { + // inputs operator output + // -------------------------------------------------------- + // (a, lstm_fc_w) mul -> fc_0_tmp_0 + // (fc_0_tmp_0, lstm_fc_b) elementwise_add -> fc_0_tmp_1 + // fc_0_tmp_1,lstm_w,lstm_b lstm -> lstm_out_0 + + // (b, lstm_fc_w) mul -> fc_1_tmp_0 + // (fc_1_tmp_0, lstm_fc_b) elementwise_add -> fc_1_tmp_1 + // (fc_1_tmp_1,lstm_w,lstm_b) lstm -> lstm_out_1 + Layers layers; + auto* a = layers.data("a"); + auto* b = layers.data("b"); + auto* fc_w = layers.data("lstm_fc_w", {}, true); + auto* fc_b = layers.data("lstm_fc_b", {}, true); + auto* lstm_w = layers.data("lstm_w", {}, true); + auto* lstm_b = layers.data("lstm_b", {}, true); + auto* fc_0_tmp0 = layers.mul(a, fc_w); + auto* fc_0_tmp1 = layers.elementwise_add(fc_0_tmp0, fc_b); + auto* lstm_cell_0 = layers.data("lstm_cell_0", {}, false); + auto* lstm_batch_gate_0 = layers.data("lstm_batch_gate_0", {}, false); + auto* lstm_batch_cell_pre_gate_0 = + layers.data("lstm_batch_cell_pre_gate_0", {}, false); + auto* lstm_hidden_0 = layers.data("lstm_hidden_0", {}, false); + layers.lstm(fc_0_tmp1, lstm_w, lstm_b, lstm_cell_0, lstm_batch_gate_0, + lstm_hidden_0, lstm_batch_cell_pre_gate_0, nullptr, nullptr, true, + false, gate_activation, cell_activation, candidate_activation); + auto* fc_1_tmp0 = layers.mul(b, fc_w); + auto* fc_1_tmp1 = layers.elementwise_add(fc_1_tmp0, fc_b); + auto* lstm_cell_1 = layers.data("lstm_cell_1", {}, false); + auto* lstm_batch_gate_1 = layers.data("lstm_batch_gate_1", {}, false); + auto* lstm_batch_cell_pre_gate_1 = + layers.data("lstm_batch_cell_pre_gate_1", {}, false); + auto* lstm_hidden_1 = layers.data("lstm_hidden_1", {}, false); + layers.lstm(fc_1_tmp1, lstm_w, lstm_b, lstm_cell_1, lstm_batch_gate_1, + lstm_hidden_1, lstm_batch_cell_pre_gate_1, nullptr, nullptr, true, + false, gate_activation, cell_activation, candidate_activation); + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + return std::move(graph); +} + +} // namespace fc_lstm_test +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h index 
ce7635bb35ce6108b4a5a356c8fb99269dbf2890..bc5fc2a16d3939648f53e91f6cd3f4f0def0fd93 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.h +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -17,7 +17,7 @@ #include #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" #include "paddle/fluid/framework/scope.h" namespace paddle { @@ -46,7 +46,7 @@ enum FuseOptions { FUSE_MKLDNN // fusing will be done with MKL-DNN }; -class FusePassBase : public Pass { +class FusePassBase : public OpCompatSensiblePass { public: void Init(const std::string& repr, Graph* graph) const; Scope* param_scope() const; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index d74e8e5f65cd2020433e9658ee9520d51c13387a..7717bcfc3e96249bd99b80525728718ee18300b5 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2262,11 +2262,26 @@ PDNode *patterns::QuantizePlacement::operator()( PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { std::unordered_set supported_op_types = - std::unordered_set({"concat", "conv2d", "conv2d_transpose", - "elementwise_add", "elementwise_mul", - "fc", "fusion_gru", "gelu", "layer_norm", - "matmul", "pool2d", "relu", "reshape2", - "softmax", "sum", "transpose2"}); + std::unordered_set({"concat", + "conv2d", + "conv2d_transpose", + "elementwise_add", + "elementwise_mul", + "fc", + "fusion_gru", + "fusion_lstm", + "gelu", + "layer_norm", + "matmul", + "matmul_v2", + "pool2d", + "prelu", + "relu", + "reshape2", + "softmax", + "split", + "sum", + "transpose2"}); if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } @@ -2340,16 +2355,7 @@ PDNode *patterns::DuplicatedInputs::operator()() { PDNode *patterns::MKLDNNInPlace::operator()() { const std::unordered_set &supported_op_types = { - "abs", - "elementwise_mul", - "elementwise_add", - "gelu", - "leaky_relu", - "relu", - "softmax", - "sqrt", - "swish", - "tanh"}; + "abs", "gelu", "leaky_relu", "relu", "softmax", "sqrt", "swish", "tanh"}; auto possible_inplace_op = pattern->NewNode(inplace_to_be_op_repr()) ->assert_is_ops(supported_op_types); @@ -2439,6 +2445,29 @@ PDNode *patterns::TransposeFlattenConcat::operator()( return concat_out; } +void patterns::DeleteDropoutOpPattern::operator()() { + auto any_op_out = pattern->NewNode(any_op_out_repr()) + ->assert_is_op_input("dropout", "X") + ->AsInput(); + + auto dropout_op = + pattern->NewNode(dropout_op_repr())->assert_is_op("dropout"); + + auto dropout_op_out = pattern->NewNode(dropout_op_out_repr()) + ->assert_is_op_output("dropout", "Out") + ->AsIntermediate(); + + auto dropout_op_outmask = pattern->NewNode(dropout_op_outmask_repr()) + ->assert_is_op_output("dropout", "Mask") + ->AsOutput(); + auto any_op2 = pattern->NewNode(any_op2_repr())->assert_is_op()->AsOutput(); + + dropout_op->LinksFrom({any_op_out}); + dropout_op_out->LinksFrom({dropout_op}); + dropout_op_outmask->LinksFrom({dropout_op}); + any_op2->LinksFrom({dropout_op_out}); +} + void patterns::DeleteQuantOpFuse::operator()(PDNode *input_act_node, const std::string &quant_type) { auto *input_scale_node = pattern->NewNode(GetNodeName("input_scale_node")) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 
cfac01ec9dedc83af4bfdce30678f933d9a8e921..13f65859954d58ce446ab3b9de488833f6220dee 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1464,6 +1464,19 @@ struct ShuffleChannelPattern : public PatternBase { PATTERN_DECL_NODE(reshape2_out); }; +struct DeleteDropoutOpPattern : public PatternBase { + DeleteDropoutOpPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "delete_dropout_op_pattern") {} + + void operator()(); + + PATTERN_DECL_NODE(any_op_out); + PATTERN_DECL_NODE(dropout_op); + PATTERN_DECL_NODE(dropout_op_out); + PATTERN_DECL_NODE(dropout_op_outmask); + PATTERN_DECL_NODE(any_op2); +}; + struct DeleteQuantDequantOpPattern : public PatternBase { DeleteQuantDequantOpPattern(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "delete_quantdequant_op_pattern") {} diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc index 18d2e9817ebec857e1b13d7d6e0e9f2201a69d94..95d55834f823bf0adf1b32537fc3e64eb088de92 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc @@ -99,6 +99,122 @@ void addIntermediateOut(Node* op_node, const std::string& out_name, } // namespace +LayerNormFusePass::LayerNormFusePass() { + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Variance") + .IsTensor() + .IsOptional() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); + AddOpCompat(OpCompat("reduce_mean")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("dim") + .IsType>() + .End() + .AddAttr("keep_dim") + .IsBoolEQ(true) + .End(); + AddOpCompat(OpCompat("sqrt")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + AddOpCompat(OpCompat("elementwise_sub")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); + AddOpCompat(OpCompat("elementwise_pow")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); + AddOpCompat(OpCompat("elementwise_div")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); + AddOpCompat(OpCompat("elementwise_mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); +} + void LayerNormFusePass::ApplyImpl(Graph* graph) const { PADDLE_ENFORCE_NOT_NULL(graph, platform::errors::InvalidArgument( @@ -117,6 +233,10 @@ void LayerNormFusePass::ApplyImpl(Graph* graph) const { int found_layer_norm_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if 
(!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } VLOG(4) << "Fuse LayerNorm from subgraph."; GET_IR_NODE_FROM_SUBGRAPH(x, x, layer_norm_pattern); GET_IR_NODE_FROM_SUBGRAPH(x_mean, x_mean, layer_norm_pattern); @@ -205,6 +325,12 @@ void LayerNormFusePass::ApplyImpl(Graph* graph) const { ln_op_desc.SetAttr("begin_norm_axis", static_cast(x_shape.size() - 1)); ln_op_desc.SetAttr("epsilon", *(eps_tensor->data())); ln_op_desc.SetAttr("is_test", true); + + if (!IsCompat(ln_op_desc)) { + LOG(WARNING) << "layer norm pass in out layer_norm op compat failed."; + return; + } + Node* ln_op = g->CreateOpNode(&ln_op_desc); addIntermediateOut(ln_op, "Mean", scope_name_, g); diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.h b/paddle/fluid/framework/ir/layer_norm_fuse_pass.h index 29a6f127065f6c2bfa3f885e44baa0f8df616a69..a9d49ea012d32dd85881ed4d16e4d35a1f1b4475 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.h +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.h @@ -70,6 +70,7 @@ namespace ir { */ class LayerNormFusePass : public FusePassBase { public: + LayerNormFusePass(); virtual ~LayerNormFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc index 5fe71fbc21451f13991cab4f612d251d028ac792..accfe8920a83c966368f7f20b7bb70fd1f1ab970 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc @@ -66,12 +66,16 @@ class LayerNormFuseTest { x_mean->SetAttr("keep_dim", true); x_mean->SetAttr("reduce_all", false); - test::CreateOp(&m_prog, "elementwise_sub", - {{"X", "x"}, {"Y", "x_mean_out"}}, - {{"Out", "x_sub_mean_out"}}, false); - test::CreateOp(&m_prog, "elementwise_pow", - {{"X", "x_sub_mean_out"}, {"Y", "sqr_pow"}}, - {{"Out", "x_sub_mean_sqr_out"}}, false); + auto* x_sub = test::CreateOp(&m_prog, "elementwise_sub", + {{"X", "x"}, {"Y", "x_mean_out"}}, + {{"Out", "x_sub_mean_out"}}, false); + x_sub->SetAttr("axis", 1); + + auto* x_pow = test::CreateOp(&m_prog, "elementwise_pow", + {{"X", "x_sub_mean_out"}, {"Y", "sqr_pow"}}, + {{"Out", "x_sub_mean_sqr_out"}}, false); + x_pow->SetAttr("axis", 1); + auto* std_dev = test::CreateOp(&m_prog, "reduce_mean", {{"X", "x_sub_mean_sqr_out"}}, {{"Out", "std_dev_out"}}, false); @@ -79,20 +83,29 @@ class LayerNormFuseTest { std_dev->SetAttr("keep_dim", true); std_dev->SetAttr("reduce_all", false); - test::CreateOp(&m_prog, "elementwise_add", - {{"X", "std_dev_out"}, {"Y", "eps"}}, - {{"Out", "std_dev_eps_out"}}, false); + auto* x_add = test::CreateOp(&m_prog, "elementwise_add", + {{"X", "std_dev_out"}, {"Y", "eps"}}, + {{"Out", "std_dev_eps_out"}}, false); + x_add->SetAttr("axis", 1); + test::CreateOp(&m_prog, "sqrt", {{"X", "std_dev_eps_out"}}, {{"Out", "std_dev_eps_sqrt_out"}}, false); - test::CreateOp(&m_prog, "elementwise_div", - {{"X", "x_sub_mean_out"}, {"Y", "std_dev_eps_sqrt_out"}}, - {{"Out", "division_out"}}, false); - test::CreateOp(&m_prog, "elementwise_mul", - {{"X", "division_out"}, {"Y", "gamma"}}, - {{"Out", "scale_out"}}, false); - test::CreateOp(&m_prog, "elementwise_add", - {{"X", "scale_out"}, {"Y", "beta"}}, {{"Out", "shift_out"}}, - false); + + auto* x_div = + test::CreateOp(&m_prog, "elementwise_div", + {{"X", "x_sub_mean_out"}, {"Y", "std_dev_eps_sqrt_out"}}, + {{"Out", "division_out"}}, false); + x_div->SetAttr("axis", 1); + + auto* x_mul = test::CreateOp(&m_prog, "elementwise_mul", + {{"X", "division_out"}, 
{"Y", "gamma"}}, + {{"Out", "scale_out"}}, false); + x_mul->SetAttr("axis", 1); + + auto* x_add_v1 = test::CreateOp(&m_prog, "elementwise_add", + {{"X", "scale_out"}, {"Y", "beta"}}, + {{"Out", "shift_out"}}, false); + x_add_v1->SetAttr("axis", 1); } template diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc index c36123f65f6644289cfba2b2729862efa601e2fd..9542d3d3d43f311d4e4237e2efa41fe3f998603d 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc @@ -16,6 +16,7 @@ #include #include +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" @@ -26,6 +27,157 @@ namespace ir { class Node; +MapMatmul2MulPass::MapMatmul2MulPass() { + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumGE(0.99f) + .IsNumLE(1.01f) + .End() + .AddAttr("transpose_X") + .IsBoolEQ(false) + .End() + .AddAttr("transpose_Y") + .IsBoolEQ(false) + .End(); + + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumGE(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); +} + +Flatten2MatmulFusePass::Flatten2MatmulFusePass() { + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumGE(0.99f) + .IsNumLE(1.01f) + .End() + .AddAttr("transpose_X") + .IsBoolEQ(false) + .End() + .AddAttr("transpose_Y") + .IsBoolEQ(false) + .End(); + + AddOpCompat(OpCompat("flatten2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumGE(0) + .End(); + + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumGE(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); +} + +Squeeze2MatmulFusePass::Squeeze2MatmulFusePass() { + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumGE(0.99f) + .IsNumLE(1.01f) + .End() + .AddAttr("transpose_X") + .IsBoolEQ(false) + .End() + .AddAttr("transpose_Y") + .IsBoolEQ(false) + .End(); + + AddOpCompat(OpCompat("Squeeze2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("axes") + .IsType>() + .End(); + + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumEQ(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); +} + void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -39,6 +191,11 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { int found_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* 
g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } + VLOG(4) << "map matmul to mul"; GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, matmul_pattern); GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern); @@ -82,6 +239,11 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { IR_NODE_LINK_TO(mul_node, matmul_out); GraphSafeRemoveNodes(graph, {matmul_op}); ++found_count; + + if (!IsCompat(desc)) { + LOG(WARNING) << "MapMatmul2MulPass in out mul op compat failed."; + return; + } } }; @@ -103,6 +265,10 @@ void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { VLOG(4) << "fuse squeeze2+matmul to mul"; + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } GET_IR_NODE_FROM_SUBGRAPH(squeeze2_in_x, squeeze2_in_x, fuse_pattern); GET_IR_NODE_FROM_SUBGRAPH(squeeze2_op, squeeze2_op, fuse_pattern); GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, fuse_pattern); @@ -152,6 +318,10 @@ void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { IR_NODE_LINK_TO(mul_node, matmul_out); GraphSafeRemoveNodes(graph, {squeeze2_op, matmul_in_x, matmul_op}); ++found_count; + if (!IsCompat(desc)) { + LOG(WARNING) << "Squeeze2MatmulFusePass in out mul op compat failed."; + return; + } } }; @@ -159,6 +329,68 @@ void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_count); } +Reshape2MatmulFusePass::Reshape2MatmulFusePass() { + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("shape") // ints + .IsType>() + .End(); + + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumGT(0.99999f) + .IsNumLT(1.00001f) + .End() + .AddAttr("transpose_X") + .IsBoolEQ("False") + .End() + .AddAttr("transpose_Y") + .IsBoolEQ("False") + .End(); + + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumEQ(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); +} + void Reshape2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -172,6 +404,10 @@ void Reshape2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { int found_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } VLOG(4) << "fuse reshape2+matmul to mul"; GET_IR_NODE_FROM_SUBGRAPH(reshape2_in_x, reshape2_in_x, fuse_pattern); GET_IR_NODE_FROM_SUBGRAPH(reshape2_op, reshape2_op, fuse_pattern); @@ -218,6 +454,10 @@ void Reshape2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); } + if (!IsCompat(desc)) { + LOG(WARNING) << "reshape2 matmul pass in out mul op compat failed."; + return; + } auto mul_node = g->CreateOpNode(&desc); IR_NODE_LINK_TO(reshape2_in_x, mul_node); 
IR_NODE_LINK_TO(matmul_in_y, mul_node); @@ -244,6 +484,11 @@ void Flatten2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { int found_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } + VLOG(4) << "fuse flatten2+matmul to mul"; GET_IR_NODE_FROM_SUBGRAPH(flatten2_in_x, flatten2_in_x, fuse_pattern); GET_IR_NODE_FROM_SUBGRAPH(flatten2_op, flatten2_op, fuse_pattern); @@ -301,6 +546,11 @@ void Flatten2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { IR_NODE_LINK_TO(mul_node, matmul_out); GraphSafeRemoveNodes(graph, {flatten2_op, matmul_in_x, matmul_op}); ++found_count; + + if (!IsCompat(desc)) { + LOG(WARNING) << "Flatten2MatmulFusePass in out mul op compat failed."; + return; + } } }; diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h index 85067a6f642fe4637467541cd08f89bba3b397db..192dcfc00f9d34bf286b8ddebe355aa1b8d381be 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h @@ -39,6 +39,7 @@ class Graph; class MapMatmul2MulPass : public FusePassBase { public: + MapMatmul2MulPass(); virtual ~MapMatmul2MulPass() {} protected: @@ -66,6 +67,7 @@ class MapMatmul2MulPass : public FusePassBase { class Squeeze2MatmulFusePass : public FusePassBase { public: + Squeeze2MatmulFusePass(); virtual ~Squeeze2MatmulFusePass() {} protected: @@ -95,6 +97,7 @@ class Squeeze2MatmulFusePass : public FusePassBase { class Reshape2MatmulFusePass : public FusePassBase { public: + Reshape2MatmulFusePass(); virtual ~Reshape2MatmulFusePass() {} protected: @@ -103,6 +106,7 @@ class Reshape2MatmulFusePass : public FusePassBase { class Flatten2MatmulFusePass : public FusePassBase { public: + Flatten2MatmulFusePass(); virtual ~Flatten2MatmulFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt index a8c0973cac488ceb96249a898e819af7565c6c7a..5434678ccb04ac9a2a3b3e722d3f0c0f9b1ff5c3 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt @@ -15,4 +15,4 @@ cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_ cc_library(inplace_addto_op_pass SRCS inplace_addto_op_pass.cc DEPS memory_reuse_pass) -cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op) +cc_test(test_reference_count_pass_last_lived_ops SRCS test_reference_count_pass_last_lived_ops.cc DEPS parallel_executor elementwise_mul_op elementwise_add_op scale_op eigen_function) diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc index 7e28ccd24a80da738ec69f00efb5053dcdf1cde4..3fdb87f254403652a99983c29f9ba283a45eed2b 100644 --- a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc @@ -29,6 +29,55 @@ void FuseBatchNormActOneDNNPass::ApplyImpl(Graph *graph) const { FuseBatchNormAct(graph, act_type); } +FuseBatchNormActOneDNNPass::FuseBatchNormActOneDNNPass() { + AddOpCompat(OpCompat("batch_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + 
.End() + .AddInput("Mean") + .IsTensor() + .End() + .AddInput("Variance") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("MeanOut") + .IsOptional() + .End() + .AddOutput("VarianceOut") + .IsOptional() + .End() + .AddOutput("SavedMean") + .IsOptional() + .End() + .AddOutput("SavedVariance") + .IsOptional() + .End() + .AddOutput("ReserveSpace") + .IsOptional() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End(); + + AddOpCompat(OpCompat("relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); +} + void FuseBatchNormActOneDNNPass::FuseBatchNormAct( Graph *graph, const std::string &act_type) const { PADDLE_ENFORCE_NOT_NULL( @@ -45,6 +94,11 @@ void FuseBatchNormActOneDNNPass::FuseBatchNormAct( auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { VLOG(4) << "Fuse BatchNorm with ReLU activation op."; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } // BN output GET_IR_NODE_FROM_SUBGRAPH(bn_out, bn_out, bn_act_pattern); // ACT output @@ -84,6 +138,11 @@ void FuseBatchNormActOneDNNPass::FuseBatchNormAct( bn_op->SetAttr("trainable_statistics", false); bn_op->SetOutput("Y", {act_out->Name()}); + if (!IsCompat(*bn_op)) { + LOG(WARNING) << "Fc fuse pass in out fc op compat failed."; + return; + } + IR_OP_VAR_LINK(batch_norm, act_out); GraphSafeRemoveNodes(g, {act, bn_out}); found_bn_act_count++; diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h index 843e7e420b7be07f7fd63d8a9a7d39791b206333..ba6a65bce8a8cc0822df07ddbdf104ae7c645be9 100644 --- a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h @@ -31,6 +31,7 @@ namespace ir { */ class FuseBatchNormActOneDNNPass : public FusePassBase { public: + FuseBatchNormActOneDNNPass(); virtual ~FuseBatchNormActOneDNNPass() {} protected: diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc index 38364721f651527da1da8839d574c1bee136fa4f..e13d44ac23222187a82753a027dd3585f423800b 100644 --- a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc @@ -32,6 +32,7 @@ void SetBatchNormAttrs(OpDesc* bn_op, bool is_test = true, bn_op->SetAttr("is_test", is_test); bn_op->SetAttr("trainable_statistics", trainable_stats); bn_op->SetAttr("fuse_with_relu", false); + bn_op->SetAttr("epsilon", 0.001f); } } diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc index 7c749d9274299a2af3d7cbab98be5b362cabbc6e..aaae505edde385b5723bdcb1987805b4ce68a5be 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc @@ -49,6 +49,11 @@ void ConvActivationFusePass::ApplyImpl(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { VLOG(4) << "handle " + conv_type() + "+" + activation_type() + " fuse"; + + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "conv_activation_mkldnn_fuse_pass op compat failed."; + return; + } GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, conv_activation_pattern); // Filter GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, 
@@ -97,6 +102,117 @@ void ConvActivationFusePass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_conv_activation_count); } +ConvActivationFusePass::ConvActivationFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsOptional() + .IsTensor() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + // IsStringIn({"EXPLICIT", "SAME", "VALID"}), MobileNetV2 has no this + // attribute + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + // IsStringIn({"NHWC", "NCHW"}) MobileNetV2 has no this attribute + .AddAttr("data_format") + .IsOptional() + .IsStringIn({"NHWC", "NCHW", "AnyLayout"}) + .End(); + + AddOpCompat(OpCompat("relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); +} +Conv2DLeakyReLUFusePass::Conv2DLeakyReLUFusePass() { + AddOpCompat(OpCompat("leaky_relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + // float, default=0.02 + .AddAttr("alpha") + .IsType() + .End(); +} +Conv2DReLU6FusePass::Conv2DReLU6FusePass() { + AddOpCompat(OpCompat("relu6")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + // default = 6.0f + .AddAttr("threshold") + .IsType() + .End(); +} +Conv2DSwishFusePass::Conv2DSwishFusePass() { + AddOpCompat(OpCompat("swish")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); +} +Conv2DHardSwishFusePass::Conv2DHardSwishFusePass() { + AddOpCompat(OpCompat("hard_swish")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + // float, optional, default=6.0 + .AddAttr("threshold") + .IsOptional() + .IsType() + .End() + // float, optional, default=6.0 + .AddAttr("scale") + .IsOptional() + .IsType() + .End() + // float, optional, default=3.0 + .AddAttr("offset") + .IsOptional() + .IsType() + .End(); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h index 2df27c420f6ecab56d5067ad0ef4a7f042f68a09..d22773fb41904afa17832224169f5430b94055c6 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h @@ -31,6 +31,7 @@ class Graph; class ConvActivationFusePass : public FusePassBase { public: + ConvActivationFusePass(); virtual ~ConvActivationFusePass() {} virtual std::string conv_type() const { return "conv2d"; } virtual std::string activation_type() const { return "relu"; } @@ -44,6 +45,7 @@ class ConvActivationFusePass : public FusePassBase { */ class Conv2DLeakyReLUFusePass : public ConvActivationFusePass { public: + Conv2DLeakyReLUFusePass(); std::string activation_type() const { return "leaky_relu"; } }; /* @@ -51,6 +53,7 @@ class Conv2DLeakyReLUFusePass : public ConvActivationFusePass { */ class Conv2DReLU6FusePass : public ConvActivationFusePass { public: + Conv2DReLU6FusePass(); std::string activation_type() const { return "relu6"; } }; /* @@ -58,6 +61,7 @@ class Conv2DReLU6FusePass : public ConvActivationFusePass { */ class 
Conv2DSwishFusePass : public ConvActivationFusePass { public: + Conv2DSwishFusePass(); std::string activation_type() const { return "swish"; } }; /* @@ -65,6 +69,7 @@ class Conv2DSwishFusePass : public ConvActivationFusePass { */ class Conv2DHardSwishFusePass : public ConvActivationFusePass { public: + Conv2DHardSwishFusePass(); std::string activation_type() const { return "hard_swish"; } }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc index 55bbad7a8875afc955af03ccecc796efa885e438..453197cda391542f41adcbeab55147b401d242f3 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h" #include +#include #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { @@ -30,9 +31,16 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("name", name); if (type == "conv2d") { op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("groups", 1); + op->SetAttr("padding_algorithm", std::string("EXPLICIT")); + op->SetAttr("data_format", std::string("NCHW")); + op->SetAttr("strides", std::vector({1, 1})); + op->SetAttr("dilations", std::vector({1, 1})); + op->SetAttr("paddings", std::vector({0, 0})); op->SetInput("Input", {inputs[0]}); op->SetInput("Filter", {inputs[1]}); op->SetInput("Bias", {inputs[2]}); + op->SetOutput("Output", outputs); } else if (is_activation) { op->SetAttr("use_mkldnn", use_mkldnn); op->SetInput("X", inputs); @@ -43,8 +51,9 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, } else if (type == "swish") { op->SetAttr("beta", 1.0f); } + op->SetOutput("Out", outputs); } - op->SetOutput("Out", outputs); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), static_cast(OpRole::kForward)); } diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index c804eeb9fc362313d29534fd47346105f3954fd7..74bbe24eb82f5d3acd16ef6d51e71cdc77341544 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -25,6 +25,129 @@ namespace paddle { namespace framework { namespace ir { +ConvBiasFusePass::ConvBiasFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC"}) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 0}) + .End(); +} + +Conv2DTransposeBiasFusePass::Conv2DTransposeBiasFusePass() { + AddOpCompat(OpCompat("conv2d_transpose")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() 
+ .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("output_padding") + .IsType>() + .IsOptional() + .End() + .AddAttr("output_size") + .IsType>() + .IsOptional() + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); +} + +Conv3DBiasFusePass::Conv3DBiasFusePass() { + AddOpCompat(OpCompat("conv3d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC"}) + .End(); +} + template LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b, BinaryOperation f) { @@ -80,6 +203,12 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { subgraph.count(conv_input), 0, platform::errors::NotFound("Detector did not find conv input.")); + // check compat + if (!IsCompat(subgraph, g)) { + VLOG(3) << "Pass in op compat failed."; + return; + } + // check if fuse can be done and if MKL-DNN should be used FuseOptions fuse_option = FindFuseOption(*conv, *eltwise); if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h index 9a83310ebfb558f4744ae508155d8aa8d01a39c7..a74d7443ee1fe13212c6514d415a16d6f0cb2f5b 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h @@ -29,6 +29,7 @@ class Graph; class ConvBiasFusePass : public FusePassBase { public: + ConvBiasFusePass(); virtual ~ConvBiasFusePass() {} virtual std::string type() const { return "conv2d"; } @@ -41,11 +42,13 @@ class ConvBiasFusePass : public FusePassBase { */ class Conv2DTransposeBiasFusePass : public ConvBiasFusePass { public: + Conv2DTransposeBiasFusePass(); std::string type() const override { return "conv2d_transpose"; } }; class Conv3DBiasFusePass : public ConvBiasFusePass { public: + Conv3DBiasFusePass(); std::string type() const override { return "conv3d"; } }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc index 455350d2f703c52a9ef3e5714a60573408310080..80a9ef7eda724a49046f636f0617cbccf51c68a2 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -31,8 +31,19 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); if (type == "conv2d") { + const std::vector strides({1, 1}); + const std::vector paddings({0, 0}); + const std::vector dilations({1, 1}); op->SetAttr("use_mkldnn", true); op->SetAttr("name", name); + op->SetAttr("strides", strides); + op->SetAttr("groups", 1); + op->SetAttr("paddings", paddings); + op->SetAttr("padding_algorithm", 
std::string("EXPLICIT")); + op->SetAttr("dilations", dilations); + op->SetAttr("data_format", std::string("NCHW")); + + op->SetOutput("Output", outputs); op->SetInput("Input", {inputs[0]}); op->SetInput("Filter", {inputs[1]}); if (inputs.size() > 2) @@ -41,10 +52,11 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("Bias", {}); } else if (type == "elementwise_add") { op->SetAttr("use_mkldnn", true); + op->SetAttr("axis", -1); op->SetInput("X", {inputs[0]}); op->SetInput("Y", {inputs[1]}); + op->SetOutput("Out", outputs); } - op->SetOutput("Out", outputs); op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), static_cast(OpRole::kForward)); } diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc index c4d7a12037293e87b84b7395a9981d95fc2ee1e8..5fbfef08b7209bc695f90ff9188b8e9a7db029a7 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc @@ -23,7 +23,67 @@ namespace paddle { namespace framework { namespace ir { -class Graph; +ConvConcatReLUFusePass::ConvConcatReLUFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); + + AddOpCompat(OpCompat("concat")) + .AddInput("X") // Input("X"): vector + .End() + .AddInput("AxisTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumGE(0) + .End(); + + AddOpCompat(OpCompat("relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); +} void ConvConcatReLUFusePass::FindConcatWithConvs( ir::Graph* graph, diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h index f1faa84f3d59b736b35ee2c206976c899d3366bf..af372dbf97c672f33722b251d5e4a9168965d766 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h @@ -18,9 +18,6 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { @@ -31,10 +28,10 @@ namespace ir { * to a: * (multi ConvReLU) -> Concat -> next_op. 
*/ -class Graph; class ConvConcatReLUFusePass : public FusePassBase { public: + ConvConcatReLUFusePass(); virtual ~ConvConcatReLUFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index fa1544f780ac1a549fa2119d552aa844345abfe7..bd65ad8e6437855dd97c70fe92aa27f4fc839a09 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -81,16 +81,72 @@ boost::optional HasAttribute(const Node& op, const std::string& attr) { return boost::none; } +ResidualConnectionMKLDNNFusePass::ResidualConnectionMKLDNNFusePass() { + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 0}) + .End(); +} + ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle( const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, const ResidualConnectionMKLDNNFusePass::IdentityConvFunc& get_node_from_conv_op, const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc& - get_node_from_elementwise_add_op) + get_node_from_elementwise_add_op, + const ResidualConnectionMKLDNNFusePass* pass) : fusion_stats{std::make_shared(0)}, can_fuse_func{can_fuse_func}, get_node_from_conv_op{get_node_from_conv_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op} {} + get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, + pass_{pass} {} void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { @@ -102,6 +158,11 @@ void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( Node* elementwise_add_op; Node* elementwise_add_identity; Node* elementwise_add_out; + if (!pass_->IsCompat(subgraph, graph)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } std::tie(conv_op, conv_input, conv_filter, conv_output) = get_node_from_conv_op(subgraph); @@ -133,12 +194,14 @@ ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle( const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& get_node_from_conv_y_op, const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc& - get_node_from_elementwise_add_op) + get_node_from_elementwise_add_op, + const ResidualConnectionMKLDNNFusePass* pass) : fusion_stats{std::make_shared(0)}, can_fuse_func{can_fuse_func}, get_node_from_conv_x_op{get_node_from_conv_x_op}, get_node_from_conv_y_op{get_node_from_conv_y_op}, - get_node_from_elementwise_add_op{get_node_from_elementwise_add_op} {} + get_node_from_elementwise_add_op{get_node_from_elementwise_add_op}, + 
pass_{pass} {} void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { @@ -155,6 +218,12 @@ void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( Node* elementwise_add_op; Node* elementwise_add_out; + if (!pass_->IsCompat(subgraph, graph)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) = get_node_from_conv_x_op(subgraph); std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) = @@ -247,7 +316,7 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { return GetNodesFromConv(conv_pattern, subgraph); }, - get_node_from_elementwise_add); + get_node_from_elementwise_add, this); } GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( @@ -284,7 +353,7 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { return GetNodesFromConv(conv_pattern, subgraph); }, - get_node_from_elementwise_add); + get_node_from_elementwise_add, this); } GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( @@ -325,7 +394,7 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( &conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) { return GetNodesFromConv(conv_y_pattern, subgraph); }, - get_node_from_elementwise_add); + get_node_from_elementwise_add, this); } void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index 2ba4c80678f0890b05c6d4c9822d8c5c9a032dc4..5b4f941836ce0b4410f004600a258c88ed5c22ac 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -84,7 +84,6 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { auto can_fuse = [this](Node* op1, Node* op2) -> bool { return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN; }; - auto fuse_handle = HandleType{can_fuse, std::forward(op_funcs)...}; (*gpd)(graph, fuse_handle); @@ -96,7 +95,8 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { IdentityFuseHandle( const CanFuseFunc& can_fuse_func, const IdentityConvFunc& get_node_from_conv_op, - const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op); + const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op, + const ResidualConnectionMKLDNNFusePass* pass); void operator()(const GraphPatternDetector::subgraph_t& subgraph, Graph* graph); @@ -107,6 +107,7 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { CanFuseFunc can_fuse_func; IdentityConvFunc get_node_from_conv_op; IdentityElementwiseAddFunc get_node_from_elementwise_add_op; + const ResidualConnectionMKLDNNFusePass* pass_; }; struct ProjectionFuseHandle { @@ -114,7 +115,8 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { const CanFuseFunc& can_fuse_func, const ProjectionConvFunc& get_node_from_conv_x_op, const ProjectionConvFunc& get_node_from_conv_y_op, - const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op); + const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op, + const ResidualConnectionMKLDNNFusePass* pass); void 
operator()(const GraphPatternDetector::subgraph_t& subgraph, Graph* graph); @@ -126,9 +128,11 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { ProjectionConvFunc get_node_from_conv_x_op; ProjectionConvFunc get_node_from_conv_y_op; ProjectionElementwiseAddFunc get_node_from_elementwise_add_op; + const ResidualConnectionMKLDNNFusePass* pass_; }; public: + ResidualConnectionMKLDNNFusePass(); virtual ~ResidualConnectionMKLDNNFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc index eafc81cc81d440a976e0176a93ff563972a1d5c9..c86c6350a16263f64554ce875c7c628760d87313 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_test_util.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -25,16 +26,67 @@ namespace ir { constexpr int nodes_removed = 3; constexpr int nodes_added = 1; +OpDesc* Create_Op_con2d(ProgramDesc* prog, const std::string& op_type_name, + const std::vector& inputs, + const std::vector& outputs, + const bool use_mkldnn = true) { + auto* op = prog->MutableBlock(0)->AppendOp(); + const std::vector strides({1, 1}); + const std::vector paddings({0, 0}); + const std::vector dilations({1, 1}); + op->SetType(op_type_name); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("strides", strides); + op->SetAttr("groups", 1); + op->SetAttr("paddings", paddings); + op->SetAttr("padding_algorithm", std::string("EXPLICIT")); + op->SetAttr("dilations", dilations); + op->SetAttr("data_format", std::string("NCHW")); + + for (const auto& input : inputs) { + op->SetInput(input.first, {input.second}); + } + for (const auto& output : outputs) { + op->SetOutput(output.first, {output.second}); + } + + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + return op; +} + +OpDesc* Create_Op_elemntwise_add( + ProgramDesc* prog, const std::string& op_type_name, + const std::vector& inputs, + const std::vector& outputs, + bool use_mkldnn = true) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(op_type_name); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("axis", -1); + + for (const auto& input : inputs) { + op->SetInput(input.first, {input.second}); + } + for (const auto& output : outputs) { + op->SetOutput(output.first, {output.second}); + } + + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + return op; +} + TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) { auto prog = test::BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"}); test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); - test::CreateOp(&prog, "conv2d", - {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, - {{"Output", "c"}}); - test::CreateOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, - {{"Out", "d"}}); + Create_Op_con2d(&prog, "conv2d", + {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, + {{"Output", "c"}}); + Create_Op_elemntwise_add(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, + {{"Out", "d"}}); test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", 
"e"}}); Graph graph(prog); @@ -53,17 +105,17 @@ TEST(ConvElementwiseAddMKLDNNFusePass, test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); // right branch - test::CreateOp(&prog, "conv2d", - {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, - {{"Output", "c"}}); + Create_Op_con2d(&prog, "conv2d", + {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, + {{"Output", "c"}}); // left branch - test::CreateOp(&prog, "conv2d", - {{"Input", "a"}, {"Bias", "bias2"}, {"Filter", "weights2"}}, - {{"Output", "f"}}); + Create_Op_con2d(&prog, "conv2d", + {{"Input", "a"}, {"Bias", "bias2"}, {"Filter", "weights2"}}, + {{"Output", "f"}}); - test::CreateOp(&prog, "elementwise_add", {{"X", "f"}, {"Y", "c"}}, - {{"Out", "d"}}); + Create_Op_elemntwise_add(&prog, "elementwise_add", {{"X", "f"}, {"Y", "c"}}, + {{"Out", "d"}}); test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", "e"}}); Graph graph(prog); @@ -80,10 +132,10 @@ TEST(ConvElementwiseAddMKLDNNFusePass, auto prog = test::BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"}); test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); - test::CreateOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}}, - {{"Output", "c"}}); - test::CreateOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, - {{"Out", "d"}}); + Create_Op_con2d(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}}, + {{"Output", "c"}}); + Create_Op_elemntwise_add(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, + {{"Out", "d"}}); test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", "e"}}); Graph graph(prog); @@ -100,12 +152,12 @@ TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsXWithElementwiseAddRelu) { test::BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"}); test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); - test::CreateOp(&prog, "conv2d", - {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, - {{"Output", "c"}}); + Create_Op_con2d(&prog, "conv2d", + {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, + {{"Output", "c"}}); - test::CreateOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, - {{"Out", "d"}}); + Create_Op_elemntwise_add(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, + {{"Out", "d"}}); test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", "e"}}); Graph graph(prog); @@ -122,10 +174,10 @@ TEST(ConvElementwiseAddMKLDNNFusePass, auto prog = test::BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"}); test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); - test::CreateOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}}, - {{"Output", "c"}}); - test::CreateOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, - {{"Out", "d"}}); + Create_Op_con2d(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}}, + {{"Output", "c"}}); + Create_Op_elemntwise_add(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, + {{"Out", "d"}}); test::CreateOp(&prog, "relu", {{"X", "d"}}, {{"Out", "e"}}); Graph graph(prog); @@ -142,14 +194,14 @@ TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) { test::BuildProgramDesc({"a", "b", "c", "d", "e", "f", "g"}, {"weights"}); test::CreateOp(&prog, "sigmoid", {{"X", "a"}}, {{"Out", "b"}}); - test::CreateOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}}, - {{"Output", "c"}}); + Create_Op_con2d(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}}, + {{"Output", "c"}}); - test::CreateOp(&prog, "conv2d", {{"Input", "d"}, {"Filter", "weights"}}, - {{"Output", "e"}}); + Create_Op_con2d(&prog, "conv2d", {{"Input", "d"}, {"Filter", 
"weights"}}, + {{"Output", "e"}}); - test::CreateOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "e"}}, - {{"Out", "f"}}); + Create_Op_elemntwise_add(&prog, "elementwise_add", {{"X", "c"}, {"Y", "e"}}, + {{"Out", "f"}}); test::CreateOp(&prog, "relu", {{"X", "f"}}, {{"Out", "g"}}); Graph graph(prog); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 34668192f0bdd00ce1a6db50e2f790c288a15f63..2483a506a8f934f8ad5837f297e019c5ad5932e2 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -25,10 +25,62 @@ namespace paddle { namespace framework { namespace ir { -class Graph; - using string::PrettyLogDetail; +CPUQuantizeSquashPass::CPUQuantizeSquashPass() { + AddOpCompat(OpCompat("scale")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("bias") + .IsNumEQ(0.0f) + .End() + .AddAttr("scale") + .IsNumGT(0.0f) + .End() + .AddAttr("bias_after_scale") // bias equal to 0.0, so this attribute is + // unconstrained. + .End(); + + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .End() + .AddAttr("paddings") + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .End() + .AddAttr("data_format") + .IsOptional() + .IsStringIn({"NCHW", "NHWC"}) + .End(); +} + void CPUQuantizeSquashPass::FindNodesToKeep( Graph* graph, std::unordered_map* nodes_keep_counter) const { @@ -354,6 +406,10 @@ void CPUQuantizeSquashPass::DequantScaleSquash(Graph* graph) const { int found_dequant_scale_squash_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } VLOG(4) << "squash dequant-scale ops pair"; GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, dequant_scale_pattern); @@ -362,9 +418,10 @@ void CPUQuantizeSquashPass::DequantScaleSquash(Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(scale_out, scale_out, dequant_scale_pattern); if (dequant_out->outputs.size() == 1 && - scale_op->Op()->GetAttrIfExists("bias") == 0.0) { + BOOST_GET_CONST(float, scale_op->Op()->GetAttr("bias")) == 0.0f) { auto dequant_scale = dequant_op->Op()->GetAttrIfExists("Scale"); - auto scale_scale = scale_op->Op()->GetAttrIfExists("scale"); + float scale_scale = + BOOST_GET_CONST(float, scale_op->Op()->GetAttr("scale")); PADDLE_ENFORCE_GT(dequant_scale, 0.0f, platform::errors::InvalidArgument( @@ -399,6 +456,10 @@ void CPUQuantizeSquashPass::ScaleQuantSquash(Graph* graph) const { int found_scale_quant_squash_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } VLOG(4) << "squash scale-quant ops pair"; GET_IR_NODE_FROM_SUBGRAPH(scale_in, scale_in, scale_quant_pattern); @@ -407,9 +468,10 @@ void CPUQuantizeSquashPass::ScaleQuantSquash(Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, scale_quant_pattern); if (quant_in->outputs.size() == 1 && - scale_op->Op()->GetAttrIfExists("bias") == 
0.0) { + BOOST_GET_CONST(float, scale_op->Op()->GetAttr("bias")) == 0.0f) { auto quant_scale = quant_op->Op()->GetAttrIfExists("Scale"); - auto scale_scale = scale_op->Op()->GetAttrIfExists("scale"); + float scale_scale = + BOOST_GET_CONST(float, scale_op->Op()->GetAttr("scale")); PADDLE_ENFORCE_GT( quant_scale, 0.0f, @@ -443,6 +505,11 @@ void CPUQuantizeSquashPass::QuantizeBf16Conv(Graph* graph) const { int found_quant_conv_squash_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } + VLOG(4) << "squash quant-conv2d ops pair"; GET_IR_NODE_FROM_SUBGRAPH(quant_in, quant_in, pattern); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h index b34d5062e3eed4adaa2cc139c0842ffd9e3ddb82..abd0f741b76317fba96748a2ed0b2182b59696bb 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h @@ -19,9 +19,6 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { @@ -30,10 +27,10 @@ namespace ir { /* * Squash dequantize->quantize pair pattern into requantize op */ -class Graph; class CPUQuantizeSquashPass : public FusePassBase { public: + CPUQuantizeSquashPass(); virtual ~CPUQuantizeSquashPass() {} protected: diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc index 08e2041a9a1e77151d4f71a80054a3bb806a2e07..f1352ebaad6d8df6e0d535a364f83e3b55cb9f93 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -25,7 +25,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, const std::vector& inputs, const std::vector& outputs, bool use_mkldnn, const std::vector scale = {}, float bias = 0.0, - const std::string& mkldnn_data_type = "float32") { + const std::string& mkldnn_data_type = "float32", + bool bias_after_scale = false, int groups = 1) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); op->SetAttr("use_mkldnn", use_mkldnn); @@ -37,6 +38,15 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]}); if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); op->SetOutput("Output", {outputs[0]}); + const std::vector strides({1, 1}); + const std::vector paddings({1, 1}); + const std::vector dilations({1, 1}); + op->SetAttr("strides", strides); + op->SetAttr("paddings", paddings); + op->SetAttr("dilations", dilations); + op->SetAttr("groups", groups); + op->SetAttr("padding_algorithm", std::string("EXPLICIT")); + op->SetAttr("data_format", std::string("NCHW")); op->SetAttr("force_fp32_output", false); op->SetAttr("mkldnn_data_type", mkldnn_data_type); } else if (type == "quantize") { @@ -74,6 +84,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetOutput("Out", {outputs[0]}); op->SetAttr("scale", scale[0]); op->SetAttr("bias", bias); + op->SetAttr("bias_after_scale", bias_after_scale); } else if (type == "matmul") { op->SetInput("X", {inputs[0]}); 
op->SetInput("Y", {inputs[1]}); @@ -373,8 +384,8 @@ ProgramDesc BuildQuantConv2dProgramDesc(const bool& use_mkldnn, prog.MutableBlock(0)->Var(v); } SetOp(&prog, "quantize", "Quant", {"a"}, {"b"}, use_mkldnn, {quant_scale}); - SetOp(&prog, "conv2d", "Conv2d", {"b"}, {"c"}, use_mkldnn, {}, 0.0f, - mkldnn_data_type); + SetOp(&prog, "conv2d", "Conv2d", {"b", "filter", "bias"}, {"c"}, use_mkldnn, + {}, 0.0f, mkldnn_data_type); return prog; } diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc index 39f47406a77ca9e11f588029678d1ca6c1e48372..039094c27093352be760eaf5ee4f712fdea355c7 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc @@ -31,6 +31,47 @@ class Graph; PADDLE_ENFORCE_NOT_NULL( \ id, platform::errors::InvalidArgument("Subgraph has no node %s.", #id)); +DepthwiseConvMKLDNNPass::DepthwiseConvMKLDNNPass() { + AddOpCompat(OpCompat("depthwise_conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsOptional() + .IsTensor() + .End() + .AddInput("ResidualData") + .IsOptional() + .IsTensor() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + // mobilenet-ssd has no "padding_algorithm" + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NHWC", "NCHW", "AnyLayout"}) + .End(); +} + void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -45,6 +86,10 @@ void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const { int found_depthwise_conv_mkldnn_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass op compat failed."; + return; + } VLOG(3) << "handle DepthwiseConvMKLDNN fuse"; GET_NODE(depthwise_conv, (*pattern)); depthwise_conv->Op()->SetType("conv2d"); diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h index 0f4ecc71ad72020b089821a0cadc4156718230e8..06ce5a41b6c4233a1b3469023727346c5efa7bea 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h @@ -24,6 +24,7 @@ class Graph; class DepthwiseConvMKLDNNPass : public FusePassBase { public: + DepthwiseConvMKLDNNPass(); virtual ~DepthwiseConvMKLDNNPass() {} protected: diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc index c6c72ba33d6295d90c502ab88d7d712d76a11aad..06940b38ea8e005c59c3c2604f6a6bb822b84511 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc @@ -29,10 +29,16 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetType(type); op->SetAttr("use_mkldnn", use_mkldnn); op->SetAttr("name", name); + op->SetAttr("groups", 1); + op->SetAttr("padding_algorithm", std::string("EXPLICIT")); + op->SetAttr("data_format", 
std::string("NCHW")); + op->SetAttr("strides", std::vector({1, 1})); + op->SetAttr("dilations", std::vector({1, 1})); + op->SetAttr("paddings", std::vector({0, 0})); op->SetInput("Input", {inputs[0]}); op->SetInput("Filter", {inputs[1]}); op->SetInput("Bias", {inputs[2]}); - op->SetOutput("Out", outputs); + op->SetOutput("Output", outputs); } // (a, weights, bias)->depthwise conv mkldnn->b diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc index fbc97a0a929c48c4eba3baa881061654dd802b62..e5bdb08fe4ab4825aef1d3d3ccd7d3a7f352574e 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc @@ -22,6 +22,63 @@ namespace paddle { namespace framework { namespace ir { +MatmulTransposeReshapeMKLDNNPass::MatmulTransposeReshapeMKLDNNPass() { + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") // unconstrained. can be any float value. + .IsType() + .End() + .AddAttr("transpose_X") // unconstrained. can be any bool value. + .IsType() + .End() + .AddAttr("transpose_Y") // unconstrained. can be any bool value. + .IsType() + .End(); + + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("axis") // ints + .IsType>() + .End(); + + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("shape") // ints + .IsType>() + .End(); +} void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const { PADDLE_ENFORCE_NOT_NULL(graph, platform::errors::InvalidArgument( @@ -37,6 +94,10 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const { int found_matmul_transpose_reshape_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } VLOG(4) << "handle matmul_transpose_reshape fuse"; GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, mtrp); GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, mtrp); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h index ef469bac40c4edbc524ef4b24c8df932819f0a3a..09cbe9bdf7b2fb5c8fd0c8676730031482f3d6d9 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h @@ -17,8 +17,6 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { @@ -27,6 +25,7 @@ class Graph; class MatmulTransposeReshapeMKLDNNPass : public FusePassBase { public: + MatmulTransposeReshapeMKLDNNPass(); virtual ~MatmulTransposeReshapeMKLDNNPass() {} protected: diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc 
index 122a7f802a52972612e2879eaea29d14e5d7c561..d98d640e1002b1ff97e9d03a44a866987e3a2af8 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc @@ -38,6 +38,9 @@ void SetOp(ProgramDesc *prog, const std::string &type, if (type == "matmul") { op->SetInput("Y", {inputs[1]}); op->SetAttr("use_mkldnn", true); + op->SetAttr("alpha", 1.0f); + op->SetAttr("transpose_X", true); + op->SetAttr("transpose_Y", true); } } diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..c4770a322db50c495f9d47aba3d338615fa36219 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_fc_rnn_fuse_pass_tester.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/ir/fc_gru_fuse_pass_tester.h" +#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass_tester.h" +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void TestFcRNNFusePass(const std::string& pass_name, + std::string activation = "tanh", + std::string gate_activation = "sigmoid", + std::string candidate_activation = "tanh") { + std::unique_ptr graph = + (pass_name == "fc_gru_fuse_pass" + ? fc_gru_test::PrepareGraph(activation, gate_activation) + : fc_lstm_test::PrepareGraph(gate_activation, activation, + candidate_activation)); + auto mkldnn_placement_pass_ = + PassRegistry::Instance().Get("mkldnn_placement_pass"); + mkldnn_placement_pass_->Set("mkldnn_enabled_op_types", + new std::unordered_set({})); + graph->Set("__param_scope__", (pass_name == "fc_gru_fuse_pass" + ? 
fc_gru_test::CreateParamScope() + : fc_lstm_test::CreateParamScope())); + graph.reset(mkldnn_placement_pass_->Apply(graph.release())); + + auto check_num_mkldnn_nodes = [&](const std::unique_ptr& graph) { + int nodes_cout = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->GetAttrIfExists("use_mkldnn")) nodes_cout++; + } + } + return nodes_cout; + }; + int num_mkldnn_nodes_before = check_num_mkldnn_nodes(graph); + int removed_mkldnn_nodes = 2; + + // OneDNN fusion_gru and fusion_lstm supports only sigmoid as a gate + // activation and tanh as an activation and candidate_activation + if (activation != "tanh" || gate_activation != "sigmoid" || + candidate_activation != "tanh") + removed_mkldnn_nodes += 2; + + auto fc_rnn_fuse_pass_ = PassRegistry::Instance().Get(pass_name); + graph.reset(fc_rnn_fuse_pass_->Apply(graph.release())); + int num_mkldnn_nodes_after = check_num_mkldnn_nodes(graph); + + PADDLE_ENFORCE_EQ(num_mkldnn_nodes_before - removed_mkldnn_nodes, + num_mkldnn_nodes_after, + platform::errors::PreconditionNotMet( + "The number of nodes with \"use_mkldnn\" attr after " + "passes is not as expected")); +} + +TEST(FcGruFusePass, use_mkldnn) { TestFcRNNFusePass("fc_gru_fuse_pass"); } + +TEST(FcGruFusePass, gru_unsupported_activations) { + TestFcRNNFusePass("fc_gru_fuse_pass", "relu", "sigmoid"); +} + +TEST(FcLstmFusePass, use_mkldnn) { TestFcRNNFusePass("fc_lstm_fuse_pass"); } + +TEST(FcLstmFusePass, lstm_unsupported_activations) { + TestFcRNNFusePass("fc_lstm_fuse_pass", "tanh", "relu", "tanh"); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(mkldnn_placement_pass); +USE_PASS(fc_gru_fuse_pass); +USE_PASS(fc_lstm_fuse_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index 01abe5a8d281b6f6d0bd2bba30dde01877926a39..90dc7801131074868073e1307ae7bfc51f2c3631 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -167,7 +167,7 @@ TEST(MKLDNNInplacePass, inplace_softmax_branched) { TEST(MKLDNNInplacePass, inplace_elementwise_add) { // Two elementwise_add mkl-dnn enabled op instances to be made inplace - MKLDNNInplacePassTest().MainTest("elementwise_add", false, 1); + MKLDNNInplacePassTest().MainTest("elementwise_add", false, 0); } TEST(MKLDNNInplacePass, inplace_tanh) { MKLDNNInplacePassTest().MainTest("tanh", false, 1); diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc index b4c53ec5f91ccb855d176f84cd12378d2ec66e26..26692849d977b5bc0e3dabbd35b7f8fa53832978 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc @@ -23,6 +23,59 @@ namespace paddle { namespace framework { namespace ir { +ReshapeTransposeMatmulMkldnnFusePass::ReshapeTransposeMatmulMkldnnFusePass() { + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + // The reshape2 op for this pass should not have "Shape" and "ShapeTensor" + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("shape") + .IsType>() + .End(); + + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + 
.AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("axis") + .IsType>() + .End(); + + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsType() + .End() + .AddAttr("transpose_X") + .IsType() + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); +} + void ReshapeTransposeMatmulMkldnnFusePass::Fuse( Graph *graph, bool with_reshape_xshape, bool with_transpose_xshape) const { GraphPatternDetector gpd; @@ -34,6 +87,11 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse( int found_reshape_transpose_matmul_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Op compatible check in " + "reshape_transpose_matmul_mkldnn_fuse_pass failed."; + return; + } VLOG(4) << "handle ReshapeTransposeMatmulMkldnn fuse"; GET_IR_NODE_FROM_SUBGRAPH(reshape_in, reshape_in, rtm_pattern); GET_IR_NODE_FROM_SUBGRAPH(reshape_op, reshape_op, rtm_pattern); diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h index 7a53b3c498413e43eea7b2e4697791d36fed1149..4637d0659af8c562440c280efb158f0fcde93f24 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.h @@ -17,8 +17,6 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { @@ -26,11 +24,10 @@ namespace ir { /* * Fuse Reshape->Transpose->MatMul when MatMul uses mkldnn. 
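+ *
+ * The fuse is applied only when the matched reshape2, transpose2 and matmul
+ * ops satisfy the op-compat rules registered in the pass constructor;
+ * otherwise the handler logs a warning and leaves the subgraph untouched.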
*/ -class Graph; class ReshapeTransposeMatmulMkldnnFusePass : public FusePassBase { public: - virtual ~ReshapeTransposeMatmulMkldnnFusePass() {} + ReshapeTransposeMatmulMkldnnFusePass(); protected: void ApplyImpl(ir::Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc index a552e42619f368c2e8e2a51213ac10d9317151cf..13f1fa50d080a33d837ebb63984cd4e5c3c1c350 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc @@ -28,6 +28,45 @@ namespace ir { class Graph; using string::PrettyLogDetail; +ScaleMatmulFusePass::ScaleMatmulFusePass() { + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumGT(0.0f) + .End() + .AddAttr("transpose_X") + .IsType() + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); + + AddOpCompat(OpCompat("scale")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("scale") + .IsNumGT(0.0f) + .End() + .AddAttr("bias") + .IsNumEQ(0.0f) + .End() + .AddAttr("bias_after_scale") + .IsOptional() + .IsType() + .End(); +} void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL(graph, @@ -43,6 +82,10 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const { int found_scale_matmul_fuse_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } GET_IR_NODE_FROM_SUBGRAPH(scale_in, scale_in, scale_matmul_pattern); GET_IR_NODE_FROM_SUBGRAPH(scale_op, scale_op, scale_matmul_pattern); GET_IR_NODE_FROM_SUBGRAPH(scale_out, scale_out, scale_matmul_pattern); @@ -75,6 +118,11 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const { matmul_op->Op()->SetInput(matmul_op_input_name, std::vector({scale_in->Name()})); IR_NODE_LINK_TO(scale_in, matmul_op); + + if (!IsCompat(*matmul_op->Op())) { + LOG(WARNING) << "scale_matmul_fuse_pass in out fc op compat failed."; + return; + } GraphSafeRemoveNodes(graph, {scale_op, scale_out}); found_scale_matmul_fuse_count++; } diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h index 32ff78d9a73683c700ceb31a1505538ff7ee6119..acea8ba563dc05ae1fb7b63afa0479cc27f74a31 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.h @@ -24,6 +24,7 @@ class Graph; class ScaleMatmulFusePass : public FusePassBase { public: + ScaleMatmulFusePass(); virtual ~ScaleMatmulFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc index d37d014a87b66076ec94ad69b381c6a73c7bca19..60f844ffc80cea2bd1fefca31435575936f5bdf5 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass_tester.cc @@ -31,6 +31,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, op->SetAttr("scale", scale); op->SetAttr("bias", bias); } else if (type == "matmul") { + op->SetAttr("transpose_X", false); + op->SetAttr("transpose_Y", false); op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); 
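+    // The matmul op-compat registered by scale_matmul_fuse_pass expects the
+    // transpose_X/transpose_Y flags to be present and alpha to be positive,
+    // so the test graph sets these attributes explicitly.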
op->SetAttr("alpha", scale); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 1e8349e878781dccc622580f5e80b803e2194dee..5a97727da3b456981d5fbef8fda053695c3bfc27 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -422,13 +422,335 @@ PDNode* MultiHeadMatmulPattern::operator()() { return transpose2_2_out_var; } -static int BuildFusionV2(Graph* graph, const std::string& name_scope, - Scope* scope) { +PDNode* MultiHeadMatmulV3Pattern::operator()() { + std::unordered_set matmul_ops{"matmul", "matmul_v2"}; + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_op_input("matmul"); + + // First path with scale + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matmul"); + auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul0_out_var = + pattern->NewNode(mul0_out_repr())->assert_is_op_output("matmul"); + + decltype(mul0) eltadd0; + decltype(mul0) eltadd0_b_var; + decltype(mul0) eltadd0_out_var; + + mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + eltadd0 = pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd0_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + + auto* reshape2_0_out_var = + pattern->NewNode(reshape2_0_out_repr())->assert_is_op_output("reshape2"); + reshape2_0_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul", "X"); + + auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd_qk_out_var->AsIntermediate()->assert_is_op_input("softmax"); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = + pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax"); + softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); + + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); 
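+  // The transpose2_qkv + reshape2_qkv pair matched next restores the
+  // (B, H, S, N) output of the QKV matmul to the original (B, S, N*H)
+  // layout, so the fused result can feed the following matmul directly.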
+ transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2"); + reshape2_qkv_out_var->assert_is_op_input("matmul"); + + // Second path to matmul + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matmul"); + auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul1_out_var = + pattern->NewNode(mul1_out_repr())->assert_is_op_output("matmul"); + + decltype(mul1) eltadd1; + decltype(mul1) eltadd1_b_var; + decltype(mul1) eltadd1_out_var; + + mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); + eltadd1_b_var = pattern->NewNode(eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd1_out_var = pattern->NewNode(eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd1_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_1 = + pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2"); + + auto* reshape2_1_out_var = + pattern->NewNode(reshape2_1_out_repr())->assert_is_op_output("reshape2"); + reshape2_1_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_1 = + pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); + auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_1_out_var->AsIntermediate()->assert_is_op_input( + "matmul", "Y"); // link to matmul qk + + // Third path to matmul + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matmul"); + auto* mul2_w_var = pattern->NewNode(mul2_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul2_out_var = + pattern->NewNode(mul2_out_repr())->assert_is_op_output("matmul"); + + decltype(mul2) eltadd2; + decltype(mul2) eltadd2_b_var; + decltype(mul2) eltadd2_out_var; + + mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); + eltadd2_b_var = pattern->NewNode(eltadd2_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd2_out_var = pattern->NewNode(eltadd2_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd2_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_2 = + pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2"); + + auto* reshape2_2_out_var = + pattern->NewNode(reshape2_2_out_repr())->assert_is_op_output("reshape2"); + reshape2_2_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_2 = + pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); + auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_2_out_var->AsIntermediate()->assert_is_ops_input( + matmul_ops); // link to matmul qkv + + // Q path + mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var}); + eltadd0->LinksFrom({mul0_out_var, eltadd0_b_var}).LinksTo({eltadd0_out_var}); + + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + // K path + mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var}); + 
eltadd1->LinksFrom({mul1_out_var, eltadd1_b_var}).LinksTo({eltadd1_out_var}); + reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var}); + transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var}); + // compute q*k + matmul_qk->LinksFrom({transpose2_0_out_var, transpose2_1_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + // V path + mul2->LinksFrom({input0, mul2_w_var}).LinksTo({mul2_out_var}); + eltadd2->LinksFrom({mul2_out_var, eltadd2_b_var}).LinksTo({eltadd2_out_var}); + reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var}); + transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var}); + // compute q*k*v + matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + + return transpose2_2_out_var; +} +} // namespace patterns + +void MultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + + int fusion_count = patterns::BuildFusion(graph, name_scope_); + AddStatis(fusion_count); +} + +MultiHeadMatmulV2FusePass::MultiHeadMatmulV2FusePass() { + AddOpCompat(OpCompat("mul")) + .AddInput("X") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddInput("Y") // the shape shoule be (N*H, N*H) + .IsTensor() + .End() + .AddOutput("Out") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumEQ(2) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + // in bias, shape is (B, S, N*H), + // in biasqk, shape is (B, H, S, S) + .IsTensor() + .End() + .AddInput("Y") + // in bias, shape is (N*H) + // in biasqk, shape is (B, H, S, S) + .IsTensor() + .End() + // in bias, shape is (B, S, N*H) + // in biasqk, shape is (B, H, S, S) + .AddOutput("Out") + .IsTensor() + .End() + // in bias, it equal to 2 + // in biasqk, it equal to -1 or 0 + .AddAttr("axis") + .IsIntIn({2, -1, 0}) + .End(); + + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("shape") // -->(B, S, H, N) <--(B, S, N*H) + .IsType>() + .End(); + + // -->: (B, S, H, N) -> (B, H, S, N) + // <--: (B, H, S, N) -> (B, S, H, N) + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("axis") // {0, 2, 1, 3} + .IsType>() + .End(); + + AddOpCompat(OpCompat("scale")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("scale") + .IsType() // copy to new op. so unconstrained. + .End() + .AddAttr("bias") + .IsNumEQ(0.f) + .End() + .AddAttr("bias_after_scale") // bias is 0, so unconstrained. 
+ .IsType() + .End(); + + // QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S) + // QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N) + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumEQ(1.0f) + .End() + .AddAttr("transpose_X") + .IsBoolEQ(false) + .End() + .AddAttr("transpose_Y") // QK(true) QKV(false) + .IsType() + .End(); + + AddOpCompat(OpCompat("softmax")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 3}) // shape is (B, H, S, S), so axis is -1 or 3 + .End(); +} + +int MultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, + const std::string& name_scope, + Scope* scope) const { GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); // Create pattern. - MultiHeadMatmulPattern multihead_pattern(pattern, name_scope); + patterns::MultiHeadMatmulPattern multihead_pattern(pattern, name_scope); multihead_pattern(); // Create New OpDesc @@ -580,6 +902,11 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, int fusion_count{0}; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) + << "Op compat check in multihead_matmul_fuse_pass_v2 failed."; + return; + } // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern); GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern); @@ -714,197 +1041,141 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, return fusion_count; } -PDNode* MultiHeadMatmulV3Pattern::operator()() { - std::unordered_set matmul_ops{"matmul", "matmul_v2"}; - auto* input0 = pattern->NewNode(input0_repr()); - input0->assert_is_op_input("matmul"); - - // First path with scale - auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matmul"); - auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) - ->AsInput() - ->assert_is_op_input("matmul", "Y"); - auto* mul0_out_var = - pattern->NewNode(mul0_out_repr())->assert_is_op_output("matmul"); - - decltype(mul0) eltadd0; - decltype(mul0) eltadd0_b_var; - decltype(mul0) eltadd0_out_var; - - mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); - - eltadd0 = pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); - eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) - ->AsInput() - ->assert_is_op_input("elementwise_add", "Y"); - - eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) - ->assert_is_op_output("elementwise_add"); - eltadd0_out_var->AsIntermediate()->assert_is_op_input("reshape2"); - - auto* reshape2_0 = - pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); - - auto* reshape2_0_out_var = - pattern->NewNode(reshape2_0_out_repr())->assert_is_op_output("reshape2"); - reshape2_0_out_var->AsIntermediate()->assert_is_op_input("transpose2"); - - auto* transpose2_0 = - pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); - auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) - ->assert_is_op_output("transpose2"); - transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul"); - - auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); - auto* matmul_qk_out_var = - pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); - matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); - - auto* eltadd_qk = - 
pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); - auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) - ->AsInput() - ->assert_is_op_input("elementwise_add", "Y"); - auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) - ->assert_is_op_output("elementwise_add"); - eltadd_qk_out_var->AsIntermediate()->assert_is_op_input("softmax"); - - auto* softmax_qk = - pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); - auto* softmax_qk_out_var = - pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax"); - softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); - - auto* matmul_qkv = - pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops); - auto* matmul_qkv_out_var = - pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops); - matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); - - auto* transpose2_qkv = - pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); - auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) - ->assert_is_op_output("transpose2"); - transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); - - auto* reshape2_qkv = - pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); - auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) - ->assert_is_op_output("reshape2"); - reshape2_qkv_out_var->assert_is_op_input("matmul"); - - // Second path to matmul - auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matmul"); - auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) - ->AsInput() - ->assert_is_op_input("matmul", "Y"); - auto* mul1_out_var = - pattern->NewNode(mul1_out_repr())->assert_is_op_output("matmul"); - - decltype(mul1) eltadd1; - decltype(mul1) eltadd1_b_var; - decltype(mul1) eltadd1_out_var; - - mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); - eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); - eltadd1_b_var = pattern->NewNode(eltadd1_b_repr()) - ->AsInput() - ->assert_is_op_input("elementwise_add", "Y"); - - eltadd1_out_var = pattern->NewNode(eltadd1_out_repr()) - ->assert_is_op_output("elementwise_add"); - eltadd1_out_var->AsIntermediate()->assert_is_op_input("reshape2"); - - auto* reshape2_1 = - pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2"); - - auto* reshape2_1_out_var = - pattern->NewNode(reshape2_1_out_repr())->assert_is_op_output("reshape2"); - reshape2_1_out_var->AsIntermediate()->assert_is_op_input("transpose2"); - - auto* transpose2_1 = - pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); - auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) - ->assert_is_op_output("transpose2"); - transpose2_1_out_var->AsIntermediate()->assert_is_op_input( - "matmul"); // link to matmul qk - - // Third path to matmul - auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matmul"); - auto* mul2_w_var = pattern->NewNode(mul2_w_repr()) - ->AsInput() - ->assert_is_op_input("matmul", "Y"); - auto* mul2_out_var = - pattern->NewNode(mul2_out_repr())->assert_is_op_output("matmul"); - - decltype(mul2) eltadd2; - decltype(mul2) eltadd2_b_var; - decltype(mul2) eltadd2_out_var; - - mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); - eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); - eltadd2_b_var = pattern->NewNode(eltadd2_b_repr()) - ->AsInput() - ->assert_is_op_input("elementwise_add", "Y"); - - eltadd2_out_var = 
pattern->NewNode(eltadd2_out_repr()) - ->assert_is_op_output("elementwise_add"); - eltadd2_out_var->AsIntermediate()->assert_is_op_input("reshape2"); - - auto* reshape2_2 = - pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2"); - - auto* reshape2_2_out_var = - pattern->NewNode(reshape2_2_out_repr())->assert_is_op_output("reshape2"); - reshape2_2_out_var->AsIntermediate()->assert_is_op_input("transpose2"); - - auto* transpose2_2 = - pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); - auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) - ->assert_is_op_output("transpose2"); - transpose2_2_out_var->AsIntermediate()->assert_is_ops_input( - matmul_ops); // link to matmul qkv - - // Q path - mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var}); - eltadd0->LinksFrom({mul0_out_var, eltadd0_b_var}).LinksTo({eltadd0_out_var}); +void MultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal( + "During the multiheadMatmul pass, The scope should not be null.")); - reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); - transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); - // K path - mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var}); - eltadd1->LinksFrom({mul1_out_var, eltadd1_b_var}).LinksTo({eltadd1_out_var}); - reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var}); - transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var}); - // compute q*k - matmul_qk->LinksFrom({transpose2_0_out_var, transpose2_1_out_var}) - .LinksTo({matmul_qk_out_var}); - eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) - .LinksTo({eltadd_qk_out_var}); - softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); - // V path - mul2->LinksFrom({input0, mul2_w_var}).LinksTo({mul2_out_var}); - eltadd2->LinksFrom({mul2_out_var, eltadd2_b_var}).LinksTo({eltadd2_out_var}); - reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var}); - transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var}); - // compute q*k*v - matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var}) - .LinksTo({matmul_qkv_out_var}); - transpose2_qkv->LinksFrom({matmul_qkv_out_var}) - .LinksTo({transpose2_qkv_out_var}); - reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) - .LinksTo({reshape2_qkv_out_var}); + int fusion_count = BuildFusionV2(graph, name_scope_, scope); + if (fusion_count > 0) { + graph->Set(kMultiheadMatmulPass, new bool(true)); + } + AddStatis(fusion_count); +} - return transpose2_2_out_var; +MultiHeadMatmulV3FusePass::MultiHeadMatmulV3FusePass() { + AddOpCompat(OpCompat("mul")) + .AddInput("X") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddInput("Y") // the shape shoule be (N*H, N*H) + .IsTensor() + .End() + .AddOutput("Out") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumEQ(2) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + // in bias, shape is (B, S, N*H), + // in biasqk, shape is (B, H, S, S) + .IsTensor() + .End() + .AddInput("Y") + // in bias, shape is (N*H) + // in biasqk, shape is (B, H, S, S) + .IsTensor() + .End() + // in bias, shape is (B, S, N*H) + // in biasqk, shape is (B, H, S, S) + .AddOutput("Out") + .IsTensor() + .End() + // in bias, it equal to 
2 + // in biasqk, it equal to -1 or 0 + .AddAttr("axis") + .IsIntIn({2, -1, 0}) + .End(); + + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("shape") // -->(B, S, H, N) <--(B, S, N*H) + .IsType>() + .End(); + + // -->: (B, S, H, N) -> (B, H, S, N) + // <--: (B, H, S, N) -> (B, S, H, N) + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("axis") // {0, 2, 1, 3} + .IsType>() + .End(); + + // QK (B, H, S, N)*(B, H, S, N) -> (B, H, S, S) + // QKV (B, H, S, S)*(B, H, S, N) -> (B, H, S, N) + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsType() // QK(anyvalue, will copy to new op) QKV(1.0) + .End() + .AddAttr("transpose_X") + .IsBoolEQ(false) + .End() + .AddAttr("transpose_Y") // QK(true) QKV(false) + .IsType() + .End(); + + AddOpCompat(OpCompat("softmax")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 3}) // shape is (B, H, S, S), so axis is -1 or 3 + .End(); } -static int BuildFusionV3(Graph* graph, const std::string& name_scope, - Scope* scope) { +int MultiHeadMatmulV3FusePass::BuildFusionV3(Graph* graph, + const std::string& name_scope, + Scope* scope) const { GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); // Create pattern. - MultiHeadMatmulV3Pattern multihead_pattern(pattern, name_scope); + patterns::MultiHeadMatmulV3Pattern multihead_pattern(pattern, name_scope); multihead_pattern(); // Create New OpDesc @@ -1155,30 +1426,6 @@ static int BuildFusionV3(Graph* graph, const std::string& name_scope, return fusion_count; } -} // namespace patterns - -void MultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const { - FusePassBase::Init(name_scope_, graph); - - int fusion_count = patterns::BuildFusion(graph, name_scope_); - AddStatis(fusion_count); -} - -void MultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const { - FusePassBase::Init(name_scope_, graph); - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, - platform::errors::Fatal( - "During the multiheadMatmul pass, The scope should not be null.")); - - int fusion_count = patterns::BuildFusionV2(graph, name_scope_, scope); - if (fusion_count > 0) { - graph->Set(kMultiheadMatmulPass, new bool(true)); - } - AddStatis(fusion_count); -} - void MultiHeadMatmulV3FusePass::ApplyImpl(Graph* graph) const { FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); @@ -1187,7 +1434,7 @@ void MultiHeadMatmulV3FusePass::ApplyImpl(Graph* graph) const { platform::errors::Fatal( "During the multiheadMatmul pass, The scope should not be null.")); - int fusion_count = patterns::BuildFusionV3(graph, name_scope_, scope); + int fusion_count = BuildFusionV3(graph, name_scope_, scope); if (fusion_count > 0) { graph->Set(kMultiheadMatmulPass, new bool(true)); } diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h index c7f1336211d3463846a61b998c4f12f11095de32..c39823e7325c191d52f7af5bc111c62956c6db94 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h +++ 
b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h @@ -18,16 +18,6 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" - -namespace paddle { -namespace framework { -namespace ir { -class Graph; -} // namespace ir -} // namespace framework -} // namespace paddle namespace paddle { namespace framework { @@ -158,22 +148,30 @@ class MultiHeadMatmulFusePass : public FusePassBase { class MultiHeadMatmulV2FusePass : public FusePassBase { public: - virtual ~MultiHeadMatmulV2FusePass() {} + MultiHeadMatmulV2FusePass(); protected: void ApplyImpl(Graph* graph) const; const std::string name_scope_{"multihead_matmul_fuse_v2"}; + + private: + int BuildFusionV2(Graph* graph, const std::string& name_scope, + Scope* scope) const; }; class MultiHeadMatmulV3FusePass : public FusePassBase { public: - virtual ~MultiHeadMatmulV3FusePass() {} + MultiHeadMatmulV3FusePass(); protected: void ApplyImpl(Graph* graph) const; const std::string name_scope_{"multihead_matmul_fuse_v3"}; + + private: + int BuildFusionV3(Graph* graph, const std::string& name_scope, + Scope* scope) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc index 2eda643d4e53aa061908f02c9d31b765241c318b..b121436ee870b36052ae6195c26cadd90a299559 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass_tester.cc @@ -64,7 +64,7 @@ TEST(MultiHeadMatmulFusePass, basic) { // (transpose_qkv) reshape -> reshape_qkv // (reshape_qkv) mul -> mul_qkv Layers layers; - auto* x = layers.data("x", {128, 768}); + auto* x = layers.data("x", {1, 128, 768}); auto out = layers.layer_norm(x); auto* layer_out = out[0]; @@ -72,41 +72,41 @@ TEST(MultiHeadMatmulFusePass, basic) { auto* weights_1 = layers.data("weights1", {768, 768}, true); auto* weights_2 = layers.data("weights2", {768, 768}, true); - auto* mul_out_0 = layers.mul(layer_out, weights_0); - auto* mul_out_1 = layers.mul(layer_out, weights_1); - auto* mul_out_2 = layers.mul(layer_out, weights_2); + auto* mul_out_0 = layers.mul(layer_out, weights_0, nullptr, 2); + auto* mul_out_1 = layers.mul(layer_out, weights_1, nullptr, 2); + auto* mul_out_2 = layers.mul(layer_out, weights_2, nullptr, 2); auto* b0 = layers.data("bias_0", {768}, true); auto* b1 = layers.data("bias_1", {768}, true); auto* b2 = layers.data("bias_2", {768}, true); - auto* elementwise_out_0 = layers.elementwise_add(mul_out_0, b0); - auto* elementwise_out_1 = layers.elementwise_add(mul_out_1, b1); - auto* elementwise_out_2 = layers.elementwise_add(mul_out_2, b2); + auto* elementwise_out_0 = layers.elementwise_add(mul_out_0, b0, nullptr, 2); + auto* elementwise_out_1 = layers.elementwise_add(mul_out_1, b1, nullptr, 2); + auto* elementwise_out_2 = layers.elementwise_add(mul_out_2, b2, nullptr, 2); - std::vector shape = {128, 12, 64}; - auto* reshape_0 = layers.reshape2(elementwise_out_0, shape); - auto* reshape_1 = layers.reshape2(elementwise_out_1, shape); - auto* reshape_2 = layers.reshape2(elementwise_out_2, shape); + std::vector shape = {1, 128, 12, 64}; + auto* reshape_0 = layers.reshape2(elementwise_out_0, shape, true); + auto* reshape_1 = layers.reshape2(elementwise_out_1, shape, true); + auto* reshape_2 = layers.reshape2(elementwise_out_2, shape, true); std::vector axis = {0, 2, 1, 3}; - auto* transpose_0 = 
layers.transpose2(reshape_0, axis); - auto* transpose_1 = layers.transpose2(reshape_1, axis); - auto* transpose_2 = layers.transpose2(reshape_2, axis); + auto* transpose_0 = layers.transpose2(reshape_0, axis, true); + auto* transpose_1 = layers.transpose2(reshape_1, axis, true); + auto* transpose_2 = layers.transpose2(reshape_2, axis, true); auto* scale_0 = layers.scale(transpose_0, 0.125, 0, false); - auto* matmul_qk = layers.matmul(scale_0, transpose_1); + auto* matmul_qk = layers.matmul(scale_0, transpose_1, nullptr, false, true); - auto* bqk = layers.data("biasqk", {768}, true); + auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true); auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk); auto* softmax_qk = layers.softmax(elementwise_qk, -1); auto* matmul_qkv = layers.matmul(softmax_qk, transpose_2); - auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}); - auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {128, 768}); + auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true); + auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 768}, true); auto* weights_l = layers.data("weightsl", {768, 768}, true); - layers.mul(reshape_qkv_out, weights_l); + layers.mul(reshape_qkv_out, weights_l, nullptr, 2); std::unique_ptr graph(new ir::Graph(layers.main_program())); graph->Set("__param_scope__", CreateParamScope()); diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f814822b6a4b6b2cd3173791c2119e220895950 --- /dev/null +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.cc @@ -0,0 +1,301 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/op_def_api.h" +#include "paddle/fluid/framework/op_info.h" + +namespace { +std::unordered_set global_extra_attrs = { + "op_role", "op_role_var", "op_namescope", + "op_callstack", "op_device", "@ENABLE_CACHE_RUNTIME_CONTEXT@", + "is_test", "use_mkldnn", "mkldnn_data_type", + "use_quantizer", "mkldnn_data_type", "use_cudnn", + "name"}; +} + +namespace paddle { +namespace framework { +namespace ir { + +AttrCompat& AttrCompat::IsStringEQ(const std::string& value) { + conditions_.emplace_back([value](const Attribute& attr) -> bool { + return value == BOOST_GET_CONST(std::string, attr); + }); + return *this; +} + +AttrCompat& AttrCompat::IsStringIn(const std::set& candidates) { + conditions_.emplace_back([candidates](const Attribute& attr) -> bool { + std::string value = BOOST_GET_CONST(std::string, attr); + for (auto& str : candidates) { + if (str == value) { + return true; + } + } + return false; + }); + return *this; +} + +AttrCompat& AttrCompat::IsStringMatch( + const std::function& func) { + conditions_.emplace_back([func](const Attribute& attr) -> bool { + std::string value = BOOST_GET_CONST(std::string, attr); + return func(value); + }); + return *this; +} + +AttrCompat& AttrCompat::IsIntIn(const std::set& candidates) { + conditions_.emplace_back([candidates](const Attribute& attr) -> bool { + int value = BOOST_GET_CONST(int, attr); + return candidates.find(value) != candidates.end(); + }); + return *this; +} + +AttrCompat& AttrCompat::IsLeftDefault() { + const std::string& op_name = op_compat_->Name(); + if (!OpInfoMap::Instance().Has(op_name)) { + LOG(WARNING) << "Op (" << op_name << ") is not registered!"; + conditions_.emplace_back([](const Attribute& attr) { return false; }); + return *this; + } + const OpInfo& op_info = OpInfoMap::Instance().Get(op_name); + const AttributeMap attrs = op_info.Checker()->GetDefaultAttrsMap(); + if (attrs.find(attr_name_) == attrs.end()) { + LOG(WARNING) << "Op (" << op_name << ") has no default attr:" << attr_name_; + conditions_.emplace_back([](const Attribute& attr) { return false; }); + } else { + Attribute default_attr = attrs.at(attr_name_); + conditions_.emplace_back([default_attr](const Attribute& attr) -> bool { + return attr == default_attr; + }); + } + return *this; +} + +bool AttrCompat::operator()(const OpDesc& op_desc) { + if (!op_desc.HasAttr(attr_name_)) { + if (!optional_) { + LOG(WARNING) << "The non-optional Attr(" << attr_name_ << ") of Op (" + << op_compat_->Name() << ") not find ! 
"; + } + return optional_; + } + const Attribute attr = op_desc.GetAttr(attr_name_); + for (auto& func : conditions_) { + if (!func(attr)) { + return false; + } + } + return true; +} +AttrCompat& AttrCompat::IsOptional() { + optional_ = true; + return *this; +} + +AttrCompat& AttrCompat::IsBoolEQ(bool v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + bool value = BOOST_GET_CONST(bool, attr); + return value == v; + }); + return *this; +} + +InputOrOutputCompat& InputOrOutputCompat::IsTensor() { + conditions_.emplace_back([](const std::vector& input) -> bool { + return input.size() == 1u; + }); + return *this; +} + +InputOrOutputCompat& InputOrOutputCompat::IsOptional() { + optional_ = true; + return *this; +} + +bool InputOrOutputCompat::operator()( + const std::vector& input) const { + if (input.empty()) return optional_; + for (auto& func : conditions_) { + if (!func(input)) { + return false; + } + } + return true; +} + +AttrCompat& OpCompat::AddAttr(const std::string& attr_name) { + PADDLE_ENFORCE_EQ( + attr_compats_.find(attr_name), attr_compats_.end(), + platform::errors::InvalidArgument( + "The attrubute compat with the same name has been added")); + attr_compats_.emplace(attr_name, AttrCompat(attr_name, this)); + return attr_compats_.at(attr_name); +} + +InputOrOutputCompat& OpCompat::AddInput(const std::string& name) { + PADDLE_ENFORCE_EQ(input_compats_.find(name), input_compats_.end(), + platform::errors::InvalidArgument( + "The input with the same name has been added")); + input_compats_.emplace(name, InputOrOutputCompat(name, this)); + return input_compats_.at(name); +} + +InputOrOutputCompat& OpCompat::AddOutput(const std::string& name) { + PADDLE_ENFORCE_EQ(output_compats_.find(name), output_compats_.end(), + platform::errors::InvalidArgument( + "The output with the same name has been added")); + output_compats_.emplace(name, InputOrOutputCompat(name, this)); + return output_compats_.at(name); +} + +bool OpCompat::Judge(const OpDesc& op_desc) { + if (is_first_judge_) { + is_first_judge_ = false; + const proto::OpDef& op_def = GetOpDef(op_name_); + if (op_def.has_extra()) { + for (const proto::OpDef_AttrDef& attr : op_def.extra().attrs()) { + extra_attrs_.emplace(attr.name()); + } + } + } + + for (auto& attr_map : op_desc.GetAttrMap()) { + const std::string& name = attr_map.first; + if (name.size() >= 10u && + 0 == name.compare(name.size() - 10u, 10u, "_threshold")) { + continue; // skip the attribute ends with "_threshold", it used for + // quantization. 
+ } + if (attr_compats_.find(attr_map.first) == attr_compats_.end()) { + if (global_extra_attrs.find(attr_map.first) != global_extra_attrs.end() || + extra_attrs_.find(attr_map.first) != extra_attrs_.end()) { + continue; + } + if (!AttrCompat(attr_map.first, this).IsLeftDefault()(op_desc)) { + LOG(WARNING) + << "The Attr(" << attr_map.first << ") of Op (" << op_name_ + << ") not reigistered in OpCompat, not in extra attribute, not " + "equal to default value!"; + return false; + } + } + } + + for (auto& attr_compat : attr_compats_) { + if (!attr_compat.second(op_desc)) { + LOG(WARNING) << " Check the Attr(" << attr_compat.first << ") of Op(" + << op_name_ << ") failed!"; + return false; + } + } + + const VariableNameMap& inputs_map = op_desc.Inputs(); + for (auto& input_desc : inputs_map) { + if (input_compats_.find(input_desc.first) == input_compats_.end()) { + if (!input_desc.second.empty()) { + LOG(WARNING) << "The Input (" << input_desc.first << ") of Operator (" + << op_name_ << ") not reigistered in OpCompat!"; + return false; + } + } + } + for (auto& input_val : input_compats_) { + if (inputs_map.find(input_val.first) == inputs_map.end()) { + if (!input_val.second.Optional()) { + LOG(WARNING) << "The No optional Input (" << input_val.first + << ") of Operator (" << op_name_ + << ") not find in op_desc!"; + return false; + } + } else { + if (!input_val.second(inputs_map.at(input_val.first))) { + LOG(WARNING) << "The Input (" << input_val.first << ") of Operator (" + << op_name_ << ") compat check failed!"; + return false; + } + } + } + + const VariableNameMap& outputs_map = op_desc.Outputs(); + for (auto& output_desc : outputs_map) { + if (output_compats_.find(output_desc.first) == output_compats_.end()) { + if (!output_desc.second.empty()) { + LOG(WARNING) << "The Output (" << output_desc.first << ") of Operator (" + << op_name_ << ") not reigistered in OpCompat!"; + return false; + } + } + } + for (auto& output_val : output_compats_) { + if (outputs_map.find(output_val.first) == outputs_map.end()) { + if (!output_val.second.Optional()) { + LOG(WARNING) << "The No optional Output (" << output_val.first + << ") of Operator (" << op_name_ + << ") not find in op_desc!"; + return false; + } + } else { + if (!output_val.second(outputs_map.at(output_val.first))) { + LOG(WARNING) << "The Output (" << output_val.first << ") of Operator (" + << op_name_ << ") compat check failed!"; + return false; + } + } + } + return true; +} + +OpCompat& OpCompatSensiblePass::AddOpCompat(OpCompat&& op_compat) { + std::string name = op_compat.Name(); + op_compat_judgers_[name].reset(new OpCompat(std::move(op_compat))); + return *(op_compat_judgers_[name]); +} + +//! Tell the Op compability of a subgraph. +bool OpCompatSensiblePass::IsCompat( + const GraphPatternDetector::subgraph_t& subgraph, Graph*) const { + PADDLE_ENFORCE_EQ(op_compat_judgers_.empty(), false, + platform::errors::InvalidArgument( + "At least one OpCompat instance should be added")); + // Check the all the ops in the subgraph are contained in the + // op_compat. 
+ for (auto& node_pair : subgraph) { + if (!node_pair.second->IsOp()) continue; + auto op_type = node_pair.second->Op()->Type(); + if (!op_compat_judgers_.count(op_type)) { + if (HasOpDef(op_type)) { + LOG(WARNING) << op_type << " compat not registered!"; + return false; + } + continue; + } + auto& judger = *op_compat_judgers_.at(op_type); + if (!judger.Judge(*(node_pair.second->Op()))) { + return false; + } + } + return true; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass.h b/paddle/fluid/framework/ir/op_compat_sensible_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..cfec1f123e238e249f7b76004b916491b347f3bd --- /dev/null +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass.h @@ -0,0 +1,279 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class OpCompat; + +class AttrCompat { + public: + AttrCompat(const std::string& attr_name, OpCompat* op_compat) + : optional_(false), attr_name_(attr_name), op_compat_(op_compat) {} + + //! Assert the attribute type is `T`. + template + AttrCompat& IsType(); + + // @{ String-related methods + //! Assert the attribute is an string in the `candidates` domain. + AttrCompat& IsStringEQ(const std::string& value); + //! Assert the attribute is an string in the `candidates` domain. + AttrCompat& IsStringIn(const std::set& candidates); + //! Assert the attribute is a string and match a custom judging function. + AttrCompat& IsStringMatch( + const std::function& func); + // @} + + //! Assert the attribute is an integer in the `candidates` domain. + AttrCompat& IsIntIn(const std::set& candidates); + + // @{ Number-releated methods + //! Assert the attribute is a number and > `v`. + template + AttrCompat& IsNumGT(T v); + //! Assert the attribute is a number and >= `v`. + template + AttrCompat& IsNumGE(T v); + //! Assert the attribute is a number and < `v`. + template + AttrCompat& IsNumLT(T v); + //! Assert the attribute is a number and <= `v`. + template + AttrCompat& IsNumLE(T v); + //! Assert the attribute is a number and == `v`. + template + AttrCompat& IsNumEQ(T v); + //! Assert the attribute is a number and matches a customized judging + //! function. + template + AttrCompat& IsNumMatch(bool (*func)(T)); + // @} + + //! Assert the attribute is a boolean value equals `v`. + AttrCompat& IsBoolEQ(bool v); + + //! Tell whether this attribute is left as default value. + AttrCompat& IsLeftDefault(); + + AttrCompat& IsOptional(); + + //! Jump back to retrieve OpCompat instance. 
+  OpCompat& End() { return *op_compat_; }
+
+  bool operator()(const OpDesc& op_desc);
+
+ private:
+  bool optional_;
+  std::string attr_name_;
+  OpCompat* op_compat_;
+  std::vector<std::function<bool(const Attribute&)>> conditions_;
+};
+
+class InputOrOutputCompat {
+ public:
+  InputOrOutputCompat(const std::string& name, OpCompat* op_compat)
+      : optional_(false), name_(name), op_compat_(op_compat) {}
+
+  InputOrOutputCompat& IsTensor();
+  InputOrOutputCompat& IsOptional();
+  bool Optional() const { return optional_; }
+  bool operator()(const std::vector<std::string>& input) const;
+
+  //! Jump back to retrieve OpCompat instance.
+  OpCompat& End() { return *op_compat_; }
+
+ private:
+  bool optional_;
+  std::string name_;
+  OpCompat* op_compat_;
+  std::vector<std::function<bool(const std::vector<std::string>&)>> conditions_;
+};
+
+/**
+ * OpCompat is a helper class to help define the compatible Op definition.
+ *
+ * Usage:
+ *   OpCompat compat("FC");
+ *   compat.AddAttr("in_num_col_dims").IsNumLE(1).End()
+ *         .AddAttr("activation_type").IsStringIn({"tanh", "sigmoid"}).End()
+ *         .AddInput("Input").IsTensor().End()
+ *         .AddInput("W").IsTensor().End()
+ *         .AddInput("Bias").IsTensor().IsOptional().End()
+ *         .AddOutput("Out").IsTensor().End()
+ *
+ * All the inference-aware Op definitions are declared as above; every other
+ * attribute not contained in the definition should be left at its default
+ * value, or the op will be judged incompatible.
+ */
+class OpCompat {
+ public:
+  explicit OpCompat(const std::string& op_name) : op_name_(op_name) {}
+  explicit OpCompat(std::string&& op_name) : op_name_(std::move(op_name)) {}
+  explicit OpCompat(const OpCompat&) = default;
+  explicit OpCompat(OpCompat&&) = default;
+
+  AttrCompat& AddAttr(const std::string& attr_name);
+  InputOrOutputCompat& AddInput(const std::string& name);
+  InputOrOutputCompat& AddOutput(const std::string& name);
+
+  //! Judge whether an OpDesc matches the defined Op compatibility.
+  bool Judge(const OpDesc& op_desc);
+  const std::string& Name() const { return op_name_; }
+
+ private:
+  std::string op_name_;
+  std::unordered_map<std::string, AttrCompat> attr_compats_;
+  std::unordered_map<std::string, InputOrOutputCompat> input_compats_;
+  std::unordered_map<std::string, InputOrOutputCompat> output_compats_;
+  std::unordered_set<std::string> extra_attrs_;
+  bool is_first_judge_ = true;
+};
+
+/**
+ * OpCompatSensiblePass is a base class for all the passes that are sensitive
+ * to Op updates.
+ * There are two methods to help tell the compatibility of an Op:
+ *   bool IsCompat(const GraphPatternDetector::subgraph_t& subgraph, Graph* g);
+ *   bool IsCompat(const OpDesc& op_desc);
+ *
+ * One can register the related Op compatibilities using
+ *   void AddOpCompat(OpCompat&& judger);
+ *
+ * Most of the Passes are used for fusing ops, so we define a method for such
+ * scenarios:
+ *   void AccessSubgraph(const GraphPatternDetector::subgraph_t& subgraph,
+ *                       Graph* g);
+ * It will check the Op compatibility automatically.
+ * For other scenarios, one should call `IsCompat` directly.
+ *
+ * A FC fuse pass example:
+ * class FcFusePass : public OpCompatSensiblePass {
+ *  public:
+ *   FcFusePass() {
+ *     // define Mul op compatibility.
+ *     AddOpCompat(OpCompat("Mul"))
+ *         .AddInput("Input").IsTensor().End()
+ *         .AddAttr("in_num_col_dims").IsNumGE(1);
+ *     AddOpCompat(OpCompat("Add")). ...;
+ *     // There are multiple activation implementations.
+ *     AddOpCompat(OpCompat("Tanh")). ...;
+ *     AddOpCompat(OpCompat("Sigmoid")). ...;
+ *   }
+ *
+ *   // override the subgraph access method
+ *   virtual bool AccessSubgraphImpl(
+ *       const GraphPatternDetector::subgraph_t& subgraph,
+ *       Graph* g) override { ... }
+ *
+ *   // Call the AccessSubgraph method in main procedure of this Pass.
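+ *
+ *   // An illustrative sketch (not prescribed by this header): inside the
+ *   // pattern-detector handler, guard any graph rewrite with IsCompat, as
+ *   // the mkldnn and multihead fuse passes do:
+ *   //   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+ *   //                      Graph* g) {
+ *   //     if (!IsCompat(subgraph, g)) {
+ *   //       LOG(WARNING) << "Op compat check failed, skip this subgraph.";
+ *   //       return;
+ *   //     }
+ *   //     // ... rewrite the matched subgraph ...
+ *   //   };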
+ * }; + */ +class OpCompatSensiblePass : public Pass { + protected: + /** + * Developer should push the compatibility `teller` for each kind of Op in the + * subgraph. + * NOTE One should add all the related op compatiblity in the construct so + * that all the following methods are valid. + */ + OpCompat& AddOpCompat(OpCompat&& op_compat); + + //! Tell the Op compability of a subgraph. + bool IsCompat(const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) const; + + //! Tell the op compatibility of a single Op. + bool IsCompat(const OpDesc& op_desc) const { + if (!op_compat_judgers_.count(op_desc.Type())) return false; + return op_compat_judgers_.at(op_desc.Type())->Judge(op_desc); + } + + private: + std::map> op_compat_judgers_; +}; + +template +AttrCompat& AttrCompat::IsType() { + conditions_.emplace_back( + [](const Attribute& attr) -> bool { return attr.type() == typeid(T); }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumGT(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value > v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumGE(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value >= v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumLT(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value < v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumLE(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value <= v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumEQ(T v) { + conditions_.emplace_back([v](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return value == v; + }); + return *this; +} + +template +AttrCompat& AttrCompat::IsNumMatch(bool (*func)(T)) { + conditions_.emplace_back([func](const Attribute& attr) -> bool { + T value = BOOST_GET_CONST(T, attr); + return func(value); + }); + return *this; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..9074a9876f9f7d200d4c464fdab57b641c1d3b5a --- /dev/null +++ b/paddle/fluid/framework/ir/op_compat_sensible_pass_tester.cc @@ -0,0 +1,218 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +TEST(OpCompatSensiblePass, compatOp) { + auto lambda = [](const std::string& str) { return str == "tanh"; }; + OpCompat compat("fc"); + compat.AddAttr("in_num_col_dims") + .IsIntIn({1, 2}) + .IsNumLE(1) + .End() + .AddAttr("activation_type") + .IsStringIn({"tanh", "sigmoid"}) + .IsStringMatch(lambda) + .End() + .AddAttr("test_attr") + .IsBoolEQ(true) + .End() + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("Test") + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + OpDesc fc_op; + + std::unordered_map attr_map; + attr_map["in_num_col_dims"] = 1; + attr_map["activation_type"] = std::string("tanh"); + attr_map["test_attr"] = true; + + fc_op.SetAttrMap(attr_map); + + fc_op.SetInput("Input", std::vector{"test_input"}); + fc_op.SetInput("W", std::vector{"test_input_0"}); + fc_op.SetInput("Bias", std::vector{"test_input_1"}); + fc_op.SetOutput("Out", std::vector{"test_output"}); + + EXPECT_STREQ(compat.Name().c_str(), "fc"); + EXPECT_TRUE(compat.Judge(fc_op)); +} + +TEST(OpCompatSensiblePass, compatOpAttribute) { + OpCompat compat("fc"); + + OpDesc fc_op; + + std::unordered_map attr_map; + attr_map["in_num_col_dims"] = 1; + fc_op.SetAttrMap(attr_map); + + OpInfo info; + info.checker_ = new OpAttrChecker(); + OpInfoMap::Instance().Insert("fc", info); + + EXPECT_FALSE(compat.Judge(fc_op)); + + info.checker_->AddAttrChecker("in_num_col_dims").SetDefault(1); + + EXPECT_TRUE(compat.Judge(fc_op)); + delete info.checker_; +} + +TEST(OpCompatSensiblePass, opDefNotFound) { + OpCompat compat("fc_1"); + + OpDesc fc_op; + + compat.Judge(fc_op); + + OpCompat compat_1(""); + + compat_1.Judge(fc_op); +} + +TEST(OpCompatSensiblePass, compatOpAttributeOptional) { + OpCompat compat("fc"); + compat.AddAttr("activation_type") + .IsOptional() + .IsStringIn({"tanh", "sigmoid"}); + OpDesc fc_op; + EXPECT_TRUE(compat.Judge(fc_op)); +} + +TEST(OpCompatSensiblePass, compatOpInput) { + OpCompat compat("fc"); + + OpDesc fc_op; + fc_op.SetInput("Input", std::vector{"test_input"}); + + EXPECT_FALSE(compat.Judge(fc_op)); + + compat.AddInput("Input").IsTensor().End().AddInput("Bias").IsTensor().End(); + EXPECT_FALSE(compat.Judge(fc_op)); + + fc_op.SetInput("Bias", std::vector{"test_input", ""}); + EXPECT_FALSE(compat.Judge(fc_op)); +} + +TEST(OpCompatSensiblePass, compatOutput) { + OpCompat compat("fc"); + + OpDesc fc_op; + fc_op.SetOutput("Output", std::vector{"test_output"}); + + EXPECT_FALSE(compat.Judge(fc_op)); + + compat.AddOutput("Output") + .IsTensor() + .End() + .AddOutput("Output_2") + .IsTensor() + .End(); + EXPECT_FALSE(compat.Judge(fc_op)); + + fc_op.SetOutput("Output_2", std::vector{"test_output", ""}); + EXPECT_FALSE(compat.Judge(fc_op)); +} + +class OpCompatSensiblePassTest : public OpCompatSensiblePass { + public: + OpCompatSensiblePassTest(); + bool TestIsCompat(const OpDesc& op_desc) { return IsCompat(op_desc); } + bool TestIsCompat(const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + return IsCompat(subgraph, g); + } +}; + +OpCompatSensiblePassTest::OpCompatSensiblePassTest() { + AddOpCompat(OpCompat("fc")) + .AddAttr("in_num_col_dims") + .IsNumLE(1) + .End() + .AddAttr("activation_type") + .IsStringIn({"tanh", 
"sigmoid"}) + .End() + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor(); +} + +TEST(OpCompatSensiblePass, IsCompat) { + OpCompatSensiblePassTest test; + OpDesc fc_op; + fc_op.SetType("fc"); + std::unordered_map attr_map; + attr_map["in_num_col_dims"] = 1; + attr_map["activation_type"] = std::string("tanh"); + + fc_op.SetAttrMap(attr_map); + fc_op.SetInput("Input", std::vector{"test_input"}); + fc_op.SetInput("W", std::vector{"test_input_0"}); + fc_op.SetInput("Bias", std::vector{"test_input_1"}); + fc_op.SetOutput("Out", std::vector{"test_output"}); + + EXPECT_TRUE(test.TestIsCompat(fc_op)); +} + +TEST(OpCompatSensiblePass, IsCompatFail) { + OpCompatSensiblePassTest test; + GraphPatternDetector::subgraph_t subgraph; + PDPattern pattern; + PDNode* pd_node = pattern.NewNode(); + ProgramDesc prog; + Graph g(prog); + OpDesc fc_op; + fc_op.SetType("op1"); + subgraph[pd_node] = g.CreateOpNode(&fc_op); + EXPECT_TRUE(test.TestIsCompat(subgraph, &g)); + + fc_op.SetType("mul"); + subgraph[pd_node] = g.CreateOpNode(&fc_op); + EXPECT_FALSE(test.TestIsCompat(subgraph, &g)); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass_tester_helper.h b/paddle/fluid/framework/ir/pass_tester_helper.h index 6b187e538d1c082dec47144ed144a746794767b9..284e54b3cb9f30b4d93fadc918634d22234fc69c 100644 --- a/paddle/fluid/framework/ir/pass_tester_helper.h +++ b/paddle/fluid/framework/ir/pass_tester_helper.h @@ -39,28 +39,49 @@ struct Layers { } VarDesc* conv2d(VarDesc* input, VarDesc* filter, VarDesc* bias, - bool use_cudnn = false) { + int groups = 1, std::vector strides = {1, 1}, + std::vector paddings = {0, 0}, + std::string padding_algorithm = "EXPLICIT", + std::vector dilations = {1, 1}, + std::string data_format = "NCHW", bool use_cudnn = false) { VarDesc* out = lod_tensor(unique_name()); OpDesc* op = program_.MutableBlock(0)->AppendOp(); op->SetType("conv2d"); op->SetInput("Input", {input->Name()}); op->SetInput("Filter", {filter->Name()}); op->SetInput("Bias", {bias->Name()}); - op->SetOutput("Out", {out->Name()}); + op->SetOutput("Output", {out->Name()}); op->SetAttr("use_cudnn", use_cudnn); + op->SetAttr("groups", groups); + op->SetAttr("strides", strides); + op->SetAttr("paddings", paddings); + op->SetAttr("padding_algorithm", padding_algorithm); + op->SetAttr("dilations", dilations); + op->SetAttr("data_format", data_format); op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), static_cast(OpRole::kForward)); return out; } - VarDesc* conv2d_transpose(VarDesc* input, VarDesc* filter, VarDesc* bias) { + VarDesc* conv2d_transpose(VarDesc* input, VarDesc* filter, VarDesc* bias, + int groups = 1, std::vector strides = {1, 1}, + std::vector paddings = {0, 0}, + std::string padding_algorithm = "EXPLICIT", + std::vector dilations = {1, 1}, + std::string data_format = "NCHW") { VarDesc* out = lod_tensor(unique_name()); OpDesc* op = program_.MutableBlock(0)->AppendOp(); op->SetType("conv2d_transpose"); op->SetInput("Input", {input->Name()}); op->SetInput("Filter", {filter->Name()}); op->SetInput("Bias", {bias->Name()}); - op->SetOutput("Out", {out->Name()}); + op->SetOutput("Output", {out->Name()}); + op->SetAttr("groups", groups); + op->SetAttr("strides", strides); + op->SetAttr("paddings", paddings); + op->SetAttr("padding_algorithm", padding_algorithm); + op->SetAttr("dilations", dilations); + op->SetAttr("data_format", 
data_format); op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), static_cast(OpRole::kForward)); return out; @@ -194,14 +215,21 @@ struct Layers { } VarDesc* mul(VarDesc* x, VarDesc* y, VarDesc* out = nullptr, - int x_num_col_dims = 1) { + int x_num_col_dims = 1, int y_num_col_dims = 1, + bool use_mkldnn = false) { AttributeMap attrs; - attrs["x_num_col_dims"] = 1; + attrs["x_num_col_dims"] = x_num_col_dims; + attrs["y_num_col_dims"] = y_num_col_dims; + attrs["use_mkldnn"] = use_mkldnn; return binary_op("mul", x, y, out, &attrs); } - VarDesc* elementwise_add(VarDesc* x, VarDesc* y, VarDesc* out = nullptr) { - return binary_op("elementwise_add", x, y, out); + VarDesc* elementwise_add(VarDesc* x, VarDesc* y, VarDesc* out = nullptr, + int axis = -1, bool use_mkldnn = false) { + AttributeMap attrs; + attrs["axis"] = axis; + attrs["use_mkldnn"] = use_mkldnn; + return binary_op("elementwise_add", x, y, out, &attrs); } VarDesc* elementwise_mul(VarDesc* x, VarDesc* y, VarDesc* out = nullptr, @@ -265,13 +293,17 @@ struct Layers { return outs; } - VarDesc* matmul(VarDesc* x, VarDesc* y, VarDesc* alpha = nullptr) { + VarDesc* matmul(VarDesc* x, VarDesc* y, VarDesc* alpha = nullptr, + bool transpose_x = false, bool transpose_y = false) { VarDesc* out = lod_tensor(unique_name()); OpDesc* op = program_.MutableBlock(0)->AppendOp(); op->SetType("matmul"); op->SetInput("X", {x->Name()}); op->SetInput("Y", {y->Name()}); op->SetOutput("Out", {out->Name()}); + op->SetAttr("transpose_X", transpose_x); + op->SetAttr("transpose_Y", transpose_y); + op->SetAttr("alpha", 1.0f); return out; } diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 2fc39fd25d56c18ac510b550186eccaeb6eb9030..60675bf84886398fb2b56d3e7e10b4dc69517a54 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -21,11 +21,216 @@ namespace paddle { namespace framework { namespace ir { - +QuantDequantFusePass::QuantDequantFusePass() { + AddOpCompat(OpCompat("fake_quantize_range_abs_max")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("InScale") + .IsTensor() + .End() + .AddInput("Iter") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("OutScale") + .IsTensor() + .End() + .AddOutput("OutScales") + .IsTensor() + .End() + .AddAttr("window_size") + .IsType() + .IsNumGT(0) + .End() + .AddAttr("bit_length") + .IsIntIn({8, 16}) + .End(); + AddOpCompat(OpCompat("fake_quantize_moving_average_abs_max")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("InScale") + .IsTensor() + .End() + .AddInput("InAccum") + .IsTensor() + .IsOptional() + .End() + .AddInput("InState") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("OutScale") + .IsTensor() + .End() + .AddOutput("OutState") + .IsTensor() + .IsOptional() + .End() + .AddOutput("OutAccum") + .IsTensor() + .IsOptional() + .End() + .AddAttr("moving_rate") + .IsType() + .IsNumGT(0.0f) + .End() + .AddAttr("bit_length") + .IsIntIn({8, 16}) + .End(); + AddOpCompat(OpCompat("fake_dequantize_max_abs")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("max_range") + .IsType() + .IsNumGT(0.0f) + .End(); + AddOpCompat(OpCompat("fake_channel_wise_dequantize_max_abs")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scales") // "Scales" is a vector with at most two tensors 
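+      // Note: IsTensor() is presumably omitted on purpose for "Scales" (as for
+      // the other vector-valued inputs declared in these passes), since the
+      // single-tensor check would not apply to an input holding two variables.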
+ .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("quant_bits") + .IsType>() + .End() + .AddAttr("quant_axis") + .IsIntIn({0, 1}) + .IsOptional() + .End(); + AddOpCompat(OpCompat("conv2d")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddInput("ResidualData") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsOptional() + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumGE(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); + AddOpCompat(OpCompat("fc")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("in_num_col_dims") + .IsNumGE(1) + .End() + .AddAttr("activation_type") + .IsStringIn({"relu", ""}) + .End(); + AddOpCompat(OpCompat("conv2d_transpose")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Output") + .IsTensor() + .End() + .AddAttr("output_padding") + .IsType>() + .IsOptional() + .End() + .AddAttr("output_size") + .IsType>() + .IsOptional() + .End() + .AddAttr("groups") + .IsNumGE(1) + .End() + .AddAttr("dilations") + .IsType>() + .End() + .AddAttr("strides") + .IsType>() + .End() + .AddAttr("paddings") + .IsType>() + .End() + .AddAttr("padding_algorithm") + .IsStringIn({"EXPLICIT", "SAME", "VALID"}) + .End() + .AddAttr("data_format") + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) + .End(); +} // Delete quant op before quantized ops, and set input scale in the attr of // quantized ops -void DeleteQuant(ir::Graph* graph, Scope* scope, - const std::string& quant_type) { +void QuantDequantFusePass::DeleteQuant(ir::Graph* graph, Scope* scope, + const std::string& quant_type) const { const std::string pattern_name = "delete_quant_fuse"; GraphPatternDetector gpd; auto* input_act_node = gpd.mutable_pattern() @@ -41,6 +246,10 @@ void DeleteQuant(ir::Graph* graph, Scope* scope, // ops linked from it auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } PADDLE_ENFORCE_EQ( subgraph.count(input_act_node), true, platform::errors::NotFound( @@ -103,9 +312,9 @@ void DeleteQuant(ir::Graph* graph, Scope* scope, // Delete dequant op after quantized ops, and convert weight from fp32 range to // int8 range -void FuseDequant(ir::Graph* graph, Scope* scope, - const std::string& quantized_op_type, - const std::string& dequant_type) { +void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, + const std::string& quantized_op_type, + const std::string& dequant_type) const { std::string weight_name = ""; std::string input_name = ""; if (quantized_op_type == "conv2d" || @@ -142,6 +351,10 @@ void FuseDequant(ir::Graph* graph, Scope* scope, // Create new op desc auto handler = [&](const 
GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } PADDLE_ENFORCE_EQ( subgraph.count(quantized_op_input), true, platform::errors::NotFound("Quantized op input node(%s) did not find " diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h index a16dc7620b428557d7cdf600a2ccfc819fdf3748..521e186c2be4160977a0b1809b0f4c899cb8cefd 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h @@ -16,7 +16,6 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { @@ -25,14 +24,20 @@ namespace ir { /// /// Fuse quant + conv2d/depthwise_conv2d/mul/fc + dequant /// -class Graph; - class QuantDequantFusePass : public FusePassBase { public: + QuantDequantFusePass(); virtual ~QuantDequantFusePass() {} protected: void ApplyImpl(ir::Graph* graph) const override; + + private: + void DeleteQuant(ir::Graph* graph, Scope* scope, + const std::string& quant_type) const; + void FuseDequant(ir::Graph* graph, Scope* scope, + const std::string& quantized_op_type, + const std::string& dequant_type) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index 479df876fbe007119c55261dd149bd515b0cd117..a03a6f5b2c72c6e7d33c92e11915c15578f54b07 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -31,6 +31,27 @@ namespace paddle { namespace framework { namespace ir { +RepeatedFCReluFusePass::RepeatedFCReluFusePass() { + AddOpCompat(OpCompat("fc")) + .AddInput("Input") + .IsTensor() + .End() + .AddInput("W") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("in_num_col_dims") + .IsNumEQ(1) + .End() + .AddAttr("activation_type") + .IsStringEQ("relu") + .End(); +} static bool IsInputOfFC(Node* n) { if (n && n->IsVar() && VarLinksToOp(n, "fc")) { return true; @@ -54,10 +75,25 @@ static bool IsFCWithAct(Node* n, const std::string& act_type = "relu") { return false; } +static bool IsFCWithPaddingWeights(Node* n) { + bool res = false; + if (n && n->IsOp() && n->Op() && n->Op()->Type() == "fc" && + n->inputs.size() == 3U && n->outputs.size() == 1U) { + if (n->Op()->HasAttr("padding_weights")) { + res = BOOST_GET_CONST(bool, n->Op()->GetAttr("padding_weights")); + } + } + return res; +} + static bool IsParamOfFC(Node* n, const std::string& param_name) { - if (IsInputOfFC(n) && n->inputs.empty() && - (n->Name() == n->outputs[0]->Op()->Input(param_name)[0])) { - return true; + if (IsInputOfFC(n) && n->inputs.empty()) { + for (auto* out : n->outputs) { + if (out->Op()->Type() == "fc" && + n->Name() == out->Op()->Input(param_name)[0]) { + return true; + } + } } return false; } @@ -255,7 +291,7 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern, fc_ops[i] = pattern->NewNode( [=](Node* x) { - if (!IsFCWithAct(x, "relu")) { + if (!IsFCWithAct(x, "relu") || IsFCWithPaddingWeights(x)) { return false; } auto* fc_out_var = x->outputs[0]; @@ -280,8 +316,9 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern, } } -static int BuildFusion(Graph* graph, const std::string& name_scope, - int num_fc) { +int 
RepeatedFCReluFusePass::BuildFusion(Graph* graph, + const std::string& name_scope, + int num_fc) const { GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); BuildRepeatedFCReluPattern(pattern, name_scope, num_fc); @@ -301,6 +338,10 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, int fusion_count{0}; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "repeated_fc_relu_fuse_pass failed in op compat."; + return; + } LOG(INFO) << "handle Repeated FC Act fuse"; std::vector weights_vars(num_fc); std::vector bias_vars(num_fc); diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h index 0be217cc748a248f4e5bf8d98922cb8ebdbd3e3c..b2933d26e07ab7a981649fd84c275ce6ddecfce8 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h @@ -31,12 +31,16 @@ class Graph; class RepeatedFCReluFusePass : public FusePassBase { public: - virtual ~RepeatedFCReluFusePass() {} + RepeatedFCReluFusePass(); protected: void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"repeated_fc_relu_fuse"}; + + private: + int BuildFusion(Graph* graph, const std::string& name_scope, + int num_fc) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc index 157fd4d1a4e18fe83e7e74d9b6ddb5970d905d6c..583e45b5742f989b3430bb6a748da43790261c59 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc @@ -174,6 +174,91 @@ PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) { return fc_out; } +SeqConcatFcFusePass::SeqConcatFcFusePass() { + AddOpCompat(OpCompat("sequence_expand")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("ref_level") + .IsNumEQ(0) + .End(); + + AddOpCompat(OpCompat("concat")) + .AddInput("X") // Input("X"): vector + .End() + .AddInput("AxisTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); + + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumEQ(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); + + AddOpCompat(OpCompat("relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("tanh")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("sigmoid")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); +} + void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init("seq_concat_fc_fuse", graph); GraphPatternDetector detector; @@ -193,6 +278,10 @@ void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const { detector(graph, [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "seq_concat_fc_fuse_pass in op compat failed."; 
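Every fuse pass touched by this patch follows the same recipe: declare the accepted definition of each op once in the constructor via AddOpCompat, then bail out of the pattern handler when IsCompat rejects the matched subgraph, exactly as the warning-and-return above does. A minimal sketch of a hypothetical pass built on this API (the class name, the op, and the omitted pattern are invented for illustration, not part of this patch):

#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h"

namespace paddle {
namespace framework {
namespace ir {

class DemoFusePass : public OpCompatSensiblePass {
 public:
  DemoFusePass() {
    // Register the accepted definition of every op the pattern may match.
    AddOpCompat(OpCompat("relu"))
        .AddInput("X")
        .IsTensor()
        .End()
        .AddOutput("Out")
        .IsTensor()
        .End();
  }

 protected:
  void ApplyImpl(Graph* graph) const override {
    GraphPatternDetector gpd;
    // ... build the pattern on gpd.mutable_pattern() ...
    gpd(graph, [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) {
      if (!IsCompat(subgraph, g)) {  // skip subgraphs whose ops drift from the
        return;                      // registered definition
      }
      // ... rewrite the matched subgraph ...
    });
  }
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle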
+ return; + } VLOG(4) << "get one concat pattern"; // fc GET_NODE(fc_w, detector.pattern()); diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h index a70411536455757b49292e990d27e372651b88c9..99dcd4455bc1e90a10fa07ef4e85ecb4ac83b6fb 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h @@ -15,8 +15,6 @@ #pragma once #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { @@ -26,6 +24,7 @@ class Graph; class SeqConcatFcFusePass : public FusePassBase { public: + SeqConcatFcFusePass(); virtual ~SeqConcatFcFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc index 9337a67651ee3c16604bfb12314a6d6bb8dce71c..9fa951920f45a311314832cdaa0e61b5319a8551 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc @@ -27,16 +27,65 @@ namespace paddle { namespace framework { namespace ir { +SeqConvEltAddReluFusePass::SeqConvEltAddReluFusePass() { + AddOpCompat(OpCompat("sequence_conv")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Filter") + .IsTensor() + .End() + .AddInput("PaddingData") + .IsOptional() + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("contextLength") + .IsNumGT(0) + .End() + .AddAttr("contextStart") // the contextStart attribute can be negative, + // unconstrained + .End() + .AddAttr("contextStride") + .IsNumEQ(1) + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(1) + .End(); + + AddOpCompat(OpCompat("relu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); +} + class Node; -int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) { +void SeqConvEltAddReluFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); - PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "X")) + PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope_, "X")) ->assert_is_op_input("sequence_conv") ->assert_var_not_persistable(); - patterns::SeqConvEltAddRelu fuse_pattern(pattern, name_scope); + patterns::SeqConvEltAddRelu fuse_pattern(pattern, name_scope_); fuse_pattern(x); // Create New OpDesc @@ -70,6 +119,10 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } VLOG(4) << "handle SeqConv EltAdd Relu fuse"; GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern); GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern); @@ -89,14 +142,6 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) { }; gpd(graph, handler); - - return fusion_count; -} - -void SeqConvEltAddReluFusePass::ApplyImpl(ir::Graph* graph) const { - FusePassBase::Init(name_scope_, graph); - - int fusion_count = BuildFusion(graph, name_scope_, param_scope()); AddStatis(fusion_count); } diff --git 
a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h index 6f623625f51d8217370f2eabfb6820eebeb6e07a..fe06002251ae2adefc64c431446f90aad5ea85b4 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h @@ -28,6 +28,7 @@ class Graph; class SeqConvEltAddReluFusePass : public FusePassBase { public: + SeqConvEltAddReluFusePass(); virtual ~SeqConvEltAddReluFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc index 6bff4a05627d3821bae02caa531e580d038479f2..effaa0814ea79e2c6a5cebfe4656916ed5bb796d 100644 --- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc @@ -52,6 +52,52 @@ static void GetConcatNodes(ir::Graph* graph, std::vector* concat_nodes) { } } // anonymous namespace +SeqPoolCVMConcatFusePass::SeqPoolCVMConcatFusePass() { + AddOpCompat(OpCompat("sequence_pool")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("MaxIndex") + .IsTensor() + .IsOptional() + .End() + .AddAttr("pooltype") + .IsStringIn({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"}) + .End() + .AddAttr("pad_value") + .End(); + AddOpCompat(OpCompat("cvm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("CVM") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddAttr("use_cvm") + .IsBoolEQ(true) + .End(); + AddOpCompat(OpCompat("concat")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("AxisTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumGE(1) + .End(); +} + void SeqPoolCVMConcatFusePass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init("seqpool_cvm_concat_fuse", graph); std::vector concat_nodes; diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h index b0a3573fb59f975400e43b6ae842c23ad262e2ff..7680c30e485a8eba259b5dd395e9fd12c7283f41 100644 --- a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h @@ -44,7 +44,7 @@ class Graph; class SeqPoolCVMConcatFusePass : public FusePassBase { public: - virtual ~SeqPoolCVMConcatFusePass() {} + SeqPoolCVMConcatFusePass(); protected: void ApplyImpl(ir::Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc index b9bd660043bf1b0d24cf302bf782ec179245ff6a..1e9598fff87a8e9504db4f60f08b9fd4160e4a58 100644 --- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc +++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc @@ -30,6 +30,44 @@ namespace ir { GET_IR_NODE(reshape2_op); \ GET_IR_NODE(reshape2_out); +ShuffleChannelDetectPass::ShuffleChannelDetectPass() { + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsOptional() + .IsTensor() + .End() + .AddInput("ShapeTensor") + .IsOptional() + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("shape") + .IsType>() + .End(); + + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + 
.IsType>() + .End(); +} + void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "shufflechannel_pattern"; FusePassBase::Init(pattern_name, graph); @@ -46,7 +84,10 @@ void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { GET_NODES; - + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "The Pass in op compat failed."; + return; + } PADDLE_ENFORCE_GT( subgraph.count(x), 0, platform::errors::NotFound("Detector did not find input X.")); diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h index d0caba5629f00384694c7aa289db734d4ab74253..4576cfd865bb3392ea01ff22bb521c7a2005c275 100644 --- a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h +++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h @@ -26,6 +26,7 @@ class Graph; class ShuffleChannelDetectPass : public FusePassBase { public: + ShuffleChannelDetectPass(); virtual ~ShuffleChannelDetectPass() {} protected: diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc index dff2f2451dac4ca985c206b7913e42fc563be4c3..282bac4e1634de4a47e573b60a9040abbfc90258 100644 --- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc +++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc @@ -34,6 +34,26 @@ namespace ir { */ class Graph; +SimplifyWithBasicOpsPass::SimplifyWithBasicOpsPass() { + AddOpCompat(OpCompat("scale")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("scale") + .IsNumGE(0.f) + .IsNumLE(1.f) + .End() + .AddAttr("bias") + .IsNumEQ(0.f) + .End() + .AddAttr("bias_after_scale") + .IsNumEQ(true) + .End(); +} + void SimplifyWithBasicOpsPass::ApplyImpl(Graph* graph) const { VLOG(3) << "Simplify the Graph with basic ops."; std::unordered_set del_node_set; @@ -145,6 +165,11 @@ bool SimplifyWithBasicOpsPass::SimplifyDropout( new_op_desc.SetAttr("bias", static_cast(0)); new_op_desc.SetAttr("bias_after_scale", true); + if (!IsCompat(new_op_desc)) { + LOG(WARNING) << "Basic ops pass in scale op compat failed."; + return false; + } + auto* scale_op_node = graph->CreateOpNode(&new_op_desc); IR_NODE_LINK_TO(dropout_x, scale_op_node); IR_NODE_LINK_TO(scale_op_node, dropout_out); diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h index 6a245c444a7ec8dd800d8432693d2fa247360634..e80de5e1cd9d1e51acebab613a1dc543eb354da6 100644 --- a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h +++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/op_compat_sensible_pass.h" namespace paddle { namespace framework { @@ -26,7 +26,10 @@ namespace ir { class Graph; class Node; -class SimplifyWithBasicOpsPass : public Pass { +class SimplifyWithBasicOpsPass : public OpCompatSensiblePass { + public: + SimplifyWithBasicOpsPass(); + protected: void ApplyImpl(Graph* graph) const override; diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index 232e1d8da4ded39df732912bc86edb9a1fb54317..3c851f13b4d4d5447918945f3adb39b4b9c6c77f 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -129,6 +129,11 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { return; } + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "skip_layernorm pass in op compat failed."; + return; + } + VLOG(4) << "handle SkipLayerNorm fuse"; GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, fused_pattern); GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, fused_pattern); diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h index 3a3e50052396a538aebb9027cb444b819129af95..804d0abdd6f06c7c1fbb995907409f0b7fbd3ae2 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.h @@ -33,6 +33,49 @@ class Graph; class SkipLayerNormFusePass : public FusePassBase { public: + SkipLayerNormFusePass() { + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({0, -1}) + .End(); + + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); + } + virtual ~SkipLayerNormFusePass() {} protected: diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc index d944da5bc4863048ca2bcbec11f3888191056e78..62f1db426c4821d762fafc32bbe83bea9ddf1d0d 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc @@ -298,7 +298,8 @@ PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, return last_out_var; } -static int BuildFusion(Graph* graph, const std::string& name_scope) { +static int BuildFusion(Graph* graph, const std::string& name_scope, + const SquaredMatSubFusePass* pass) { GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); @@ -320,6 +321,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { LOG(INFO) << "handle sqaure mat sub fuse"; + if (!pass->IsAcceptable(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } + auto& fused_pattern = gpd.pattern(); auto* matx = retrieve_node(name_scope + "/x", subgraph, fused_pattern); @@ -368,14 +374,109 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { GraphSafeRemoveNodes(graph, 
marked_nodes); ++fusion_count; }; - gpd(graph, handler); return fusion_count; } +SquaredMatSubFusePass::SquaredMatSubFusePass() { + AddOpCompat(OpCompat("square")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End(); + + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumGE(0.99f) + .IsNumLE(1.01f) + .End() + .AddAttr("transpose_X") + .IsBoolEQ(false) + .End() + .AddAttr("transpose_Y") + .IsBoolEQ(false) + .End(); + + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsBoolEQ(false) + .End() + .AddAttr("trans_y") + .IsBoolEQ(false) + .End(); + + AddOpCompat(OpCompat("elementwise_sub")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 0}) + .End(); + + AddOpCompat(OpCompat("elementwise_mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 0}) + .End(); + + AddOpCompat(OpCompat("fill_constant")) + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("dtype") + .IsNumGE(0) + .IsNumLE(25) + .End() + .AddAttr("shape") + .End() + // type:float,there is no restriction + .AddAttr("value") + .End(); +} + +// to use IsCompat +bool SquaredMatSubFusePass::IsAcceptable( + const GraphPatternDetector::subgraph_t& subgraph, Graph* g) const { + return IsCompat(subgraph, g); +} + void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const { FusePassBase::Init(name_scope_, graph); - int fusion_count = BuildFusion(graph, name_scope_); + int fusion_count = BuildFusion(graph, name_scope_, this); AddStatis(fusion_count); } diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h index 90def957df4bf0907a306798fbb1e9ba53c37919..fcc5b309157f082b1ccfaa4011f1ee78bd22f7ef 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h @@ -31,11 +31,13 @@ class Graph; class SquaredMatSubFusePass : public FusePassBase { public: + SquaredMatSubFusePass(); + bool IsAcceptable(const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) const; virtual ~SquaredMatSubFusePass() {} protected: void ApplyImpl(ir::Graph* graph) const override; - const std::string name_scope_{"squared_mat_sub_fuse"}; }; diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index 50d6b97bbea8ef5508f8bfaa8f84717cecb375f4..523c2161326466eac21e89d9b5442c16138e967a 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -19,7 +19,50 @@ namespace paddle { namespace framework { namespace ir { -void RunTransposeFlattenConcatFuse(ir::Graph *graph, int times) { +TransposeFlattenConcatFusePass::TransposeFlattenConcatFusePass() { + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("axis") + .IsType>() + .End(); + AddOpCompat(OpCompat("flatten2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + 
.End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumGE(0) + .End(); + AddOpCompat(OpCompat("concat")) + .AddInput("X") // Input("X"): vector + .End() + .AddInput("AxisTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({0, 1}) + .End(); +} + +void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( + ir::Graph *graph, int times) const { const std::string pattern_name = "transpose_flatten" + std::to_string(times) + "_concat_fuse"; @@ -37,6 +80,10 @@ void RunTransposeFlattenConcatFuse(ir::Graph *graph, int times) { auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } const int kNumFields = 5; const int kTransOffset = 1; const int kTransOutOffset = 2; diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h index 939a8c31e5501e23968f9b44b4fe09e78280fd07..7c3ef2986e27e0656b3722bc5cb1c77d98190d62 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h @@ -16,7 +16,6 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { @@ -28,10 +27,14 @@ namespace ir { // structure. class TransposeFlattenConcatFusePass : public FusePassBase { public: + TransposeFlattenConcatFusePass(); virtual ~TransposeFlattenConcatFusePass() {} protected: void ApplyImpl(ir::Graph* graph) const override; + + private: + void RunTransposeFlattenConcatFuse(ir::Graph* graph, int times) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.cc b/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.cc index dc97e8c0233a60cfe789e33e63782d94ced907e9..d53431d260eaffd07ea8141b40a58b5df000ac63 100644 --- a/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.cc +++ b/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.cc @@ -73,6 +73,46 @@ PDNode *UnsqueezeEltwise::operator()(PDNode *x, PDNode *y) { } // namespace patterns +UnsqueezeEltwiseFusePass::UnsqueezeEltwiseFusePass() { + AddOpCompat(OpCompat("unsqueeze2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("AxesTensor") + .IsOptional() + .IsTensor() + .End() + .AddInput("AxesTensorList") + .IsOptional() + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axes") + .IsType>() + .End(); + + AddOpCompat(OpCompat("elementwise_mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + // The attribute value is - 1 before fusion and 0 after fusion + .AddAttr("axis") + .IsIntIn({-1, 0}) + .End(); +} + void UnsqueezeEltwiseFusePass::ApplyImpl(ir::Graph *graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::PreconditionNotMet("graph should not be null.")); @@ -100,7 +140,10 @@ void UnsqueezeEltwiseFusePass::ApplyImpl(ir::Graph *graph) const { LOG(WARNING) << "The subgraph is empty."; return; } - + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } VLOG(4) << "handle UnsqueezeEltwise fuse"; GET_IR_NODE_FROM_SUBGRAPH(eltwise_op, elementwise, fused_pattern); GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, 
fused_pattern); @@ -123,6 +166,10 @@ void UnsqueezeEltwiseFusePass::ApplyImpl(ir::Graph *graph) const { IR_NODE_LINK_TO(eltwise_op, eltwise_out); GraphSafeRemoveNodes(graph, {unsqz_op, unsqz_out}); found_subgraph_count++; + if (!IsCompat(*eltwise_op->Op())) { + LOG(WARNING) << "unsqueeze2_eltwise_fuse_pass op compat failed."; + return; + } } }; diff --git a/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.h b/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.h index 3be29f0e0288855e3f7e940c527f80b66edccca9..0410e5b3f330cdf4f20df6b9b17e661e1a699b6c 100644 --- a/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.h +++ b/paddle/fluid/framework/ir/unsqueeze2_eltwise_fuse_pass.h @@ -34,6 +34,7 @@ class Graph; // it maybe change in runtime. class UnsqueezeEltwiseFusePass : public FusePassBase { public: + UnsqueezeEltwiseFusePass(); virtual ~UnsqueezeEltwiseFusePass() {} protected: diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 0a6b5e44452fe191fce5fea058194a92e3a406de..69a2a6eefaf8ca51d62842e62a6a731c6cbd3231 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -276,7 +276,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor) { SerializeToStream(os, tensor, *dev_ctx); } -void DeserializeFromStream(std::ifstream &os, LoDTensor *tensor) { +void DeserializeFromStream(std::istream &os, LoDTensor *tensor) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext *dev_ctx; dev_ctx = pool.Get(platform::CPUPlace()); diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 6b357aba1c5f9a4c0db53b20a9d47e64b71d0a11..7dee0f44e384d4eda9ccb9507f62527a7795b221 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -257,7 +257,7 @@ LoD ConvertToOffsetBasedLoD(const LoD& length_lod); void SerializeToStream(std::ostream& os, const LoDTensor& tensor); -void DeserializeFromStream(std::ifstream& os, LoDTensor* tensor); +void DeserializeFromStream(std::istream& os, LoDTensor* tensor); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 198bb65863bb6a18d341128b61fa35e4359cdc26..c0ccc196348a5761ea4dedf1aab5ce8754eb74b5 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -112,6 +112,8 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, #ifdef PADDLE_WITH_HETERPS workers_[i]->SetPlace(places_[i]); workers_[i]->SetReaderPlace(places_[i]); + workers_[i]->SetDeviceContext( + platform::DeviceContextPool::Instance().Get(places_[i])); #else workers_[i]->SetPlace(place); workers_[i]->SetReaderPlace(place); @@ -176,6 +178,7 @@ void MultiTrainer::Run() { #ifdef PADDLE_WITH_HETERPS void MultiTrainer::MergeDenseParam() { +#ifdef PADDLE_WTIH_PSCORE auto communicator = paddle::distributed::Communicator::GetInstance(); auto& recv_ctx = communicator->GetRecvCtxMap(); Scope* thread_scope = workers_[0]->GetThreadScope(); @@ -189,6 +192,7 @@ void MultiTrainer::MergeDenseParam() { TensorCopy((*tensor), root_tensor->place(), root_tensor); } } +#endif } #endif diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index f107321958ba7be4d3ba31bd128f0cbbad694b85..7d55d8c41e3e92349dc9986b3d236db2ebdac01b 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ 
b/paddle/fluid/framework/naive_executor.cc @@ -128,7 +128,7 @@ NaiveExecutor::~NaiveExecutor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_); + ClearMKLDNNCache(place_, this); #endif } diff --git a/paddle/fluid/framework/op_def.proto b/paddle/fluid/framework/op_def.proto new file mode 100644 index 0000000000000000000000000000000000000000..7c4b42b1344b8b236078de694b67e05d983ed2a9 --- /dev/null +++ b/paddle/fluid/framework/op_def.proto @@ -0,0 +1,43 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; + +import "framework.proto"; +package paddle.framework.proto; + +message OpDef { + + message VarDef { + required string name = 1; + + // For the type of input / output variables. + reserved 2; + } + + message AttrDef { + required string name = 1; + required AttrType type = 2; + } + + message Desc { + repeated VarDef inputs = 1; + repeated VarDef outputs = 2; + repeated AttrDef attrs = 3; + } + + required string type = 1; + required Desc def = 2; + optional Desc extra = 3; +} diff --git a/paddle/fluid/framework/op_def_api.cc b/paddle/fluid/framework/op_def_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..73f1409ae690e1eecdb3078d943bf9fd495e7106 --- /dev/null +++ b/paddle/fluid/framework/op_def_api.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
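Since OpDef, Desc, and AttrDef above are ordinary protobuf messages, an op definition is just protobuf text format and can be parsed with TextFormat, which is also how the generated op_def.pbtxt map is consumed in the code that follows. A minimal, self-contained sketch (the op name "my_op" and its "scale" attribute are invented for illustration; the AttrType value comes from framework.proto):

#include <google/protobuf/text_format.h>

#include "paddle/fluid/framework/op_def.pb.h"

// Parse a hand-written OpDef from protobuf text format (illustrative only).
bool ParseExampleOpDef(paddle::framework::proto::OpDef* op_def) {
  const char* text =
      "type: \"my_op\"\n"
      "def {\n"
      "  inputs { name: \"X\" }\n"
      "  outputs { name: \"Out\" }\n"
      "  attrs { name: \"scale\" type: FLOAT }\n"
      "}\n";
  return ::google::protobuf::TextFormat::ParseFromString(text, op_def);
}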
+ +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif +#include "paddle/fluid/framework/op_def_api.h" +#include +#include +#include +#include +#ifdef _LINUX +#include +#include +#include +#endif +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/op_def.pb.h" + +/* +// op_def.pbtxt +namespace { + const std::unordered_map op_def_map = {...}; +} +*/ +#include "paddle/fluid/framework/op_def.pbtxt" //NOLINT + +namespace paddle { +namespace framework { + +const proto::OpDef& GetOpDef(const std::string& op_name) { + static std::unordered_map ops_definition; + static std::mutex mtx; + if (ops_definition.find(op_name) == ops_definition.end()) { + std::lock_guard lk(mtx); + if (ops_definition.find(op_name) == ops_definition.end()) { + proto::OpDef op_def; + if (op_def_map.find(op_name) == op_def_map.end()) { + LOG(WARNING) << op_name << ".pbtxt not exist!"; + } else { + if (!::google::protobuf::TextFormat::ParseFromString( + op_def_map.at(op_name), &op_def)) { + LOG(WARNING) << "Failed to parse " << op_name; + } + } + if (op_def.type() != op_name) { + LOG(WARNING) << op_name << ".pbtxt has error type :" << op_def.type(); + ops_definition.emplace(std::make_pair(op_name, proto::OpDef())); + } else { + ops_definition.emplace(std::make_pair(op_name, std::move(op_def))); + } + } + } + return ops_definition.at(op_name); +} + +bool HasOpDef(const std::string& op_name) { + return op_def_map.find(op_name) != op_def_map.end(); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/distributed/table/graph_edge.cc b/paddle/fluid/framework/op_def_api.h similarity index 67% rename from paddle/fluid/distributed/table/graph_edge.cc rename to paddle/fluid/framework/op_def_api.h index cc90f4c6516c1873b078b96c550d0d52ac5d3b9c..1ef2254d0da361915f29b713e2d9a53d5c35cb8a 100644 --- a/paddle/fluid/distributed/table/graph_edge.cc +++ b/paddle/fluid/framework/op_def_api.h @@ -12,18 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/distributed/table/graph_edge.h" -#include -namespace paddle { -namespace distributed { +#pragma once -void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { - id_arr.push_back(id); -} +#include "paddle/fluid/framework/op_def.pb.h" -void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { - id_arr.push_back(id); - weight_arr.push_back(weight); -} +namespace paddle { +namespace framework { +const proto::OpDef& GetOpDef(const std::string& op_name); + +bool HasOpDef(const std::string& op_name); } } diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 0b9fd0a47e22c76fa9612d4e0ff3632448197a98..8fbea51584d3cad5de7d30537df07f6c676f1cf1 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -66,6 +66,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, op_checker_ = attr_checker; Make(); op_checker_->RecordExplicitCheckerNum(); + op_checker_->InitDefaultAttributeMap(); AddAttr(OpRoleAttrName(), "The role of this operator") .InEnum( diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 818da7478b2392841d0b1b7221270b6f840465ec..348ca5b952bfeab364a5b01ec99e4d0381ab4e84 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -25,7 +25,8 @@ limitations under the License. 
*/ #include #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "glog/logging.h" // For VLOG() +#include "gflags/gflags.h" +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/grad_op_desc_maker.h" @@ -67,6 +68,8 @@ class Version; } // namespace framework } // namespace paddle +DECLARE_bool(check_kernel_launch); + namespace paddle { namespace framework { @@ -134,6 +137,19 @@ class OpRegistry { static std::unique_ptr CreateOp(const OpDesc& op_desc); }; +template +inline void CheckKernelLaunch(const char* op_type) {} + +#ifdef PADDLE_WITH_CUDA +template <> +inline void CheckKernelLaunch<::paddle::platform::CUDAPlace>( + const char* op_type) { + if (FLAGS_check_kernel_launch) { + PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(op_type); + } +} +#endif + template struct OpKernelRegistrarFunctor; @@ -162,8 +178,9 @@ struct OpKernelRegistrarFunctor { RegisterKernelClass( op_type, library_type, customized_type_value, - [](const framework::ExecutionContext& ctx) { + [op_type](const framework::ExecutionContext& ctx) { KERNEL_TYPE().Compute(ctx); + CheckKernelLaunch(op_type); }); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctor @@ -223,8 +240,13 @@ struct OpKernelRegistrarFunctorEx(op_type, library_type, - customized_type_value, Functor()); + RegisterKernelClass( + op_type, library_type, customized_type_value, + + [op_type](const framework::ExecutionContext& ctx) { + Functor()(ctx); + CheckKernelLaunch(op_type); + }); constexpr auto size = std::tuple_size>::value; @@ -295,8 +317,12 @@ struct OpKernelRegistrarFunctorExGetPlace(); + } else if (SupportNPU()) { + expected_kernel_key.place_ = dev_ctx->GetPlace(); } else { expected_kernel_key.place_ = platform::CPUPlace(); LOG_FIRST_N(WARNING, 1) @@ -1299,7 +1301,11 @@ void OperatorWithKernel::TransferInplaceVarsBack( auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var); auto original_dims = original_tensor->dims(); original_tensor->ShareDataWith(*transformed_tensor); - original_tensor->Resize(original_dims); + // In order to solve the problem that the output latitude of NPU reshape + // operator is not changed when inplace. + if (type_ != "reshape2" && type_ != "reshape2_grad") { + original_tensor->Resize(original_dims); + } } } @@ -1525,7 +1531,12 @@ Scope* OperatorWithKernel::PrepareData( // the rest iterations to save the elapsed time. // We do not support skipping PrepareData in while block, because the Op's // input may be changed by subsequent Ops, which may cause an error. - if (pre_scope_ == &scope && new_scope == nullptr) { + + // For inference, ops that behind conditional branch aren't supported well, + // so disable prepare optimization conservatively. 
+ bool force_prepare_data = HasAttr("inference_force_prepare_data") && + Attr("inference_force_prepare_data"); + if (pre_scope_ == &scope && new_scope == nullptr && !force_prepare_data) { need_prepare_data_ = false; } @@ -1549,10 +1560,10 @@ void OperatorWithKernel::ParseInputDataType( } else if (var->IsType()) { t = &(var->Get().value()); } else if (var->IsType()) { - auto t_arr = var->Get(); - for (size_t j = 0; j < t_arr.size(); j++) { - if (t_arr[j].IsInitialized()) { - t = &(t_arr[j]); + auto t_arr = &var->Get(); + for (size_t j = 0; j < t_arr->size(); j++) { + if (t_arr->at(j).IsInitialized()) { + t = &(t_arr->at(j)); } } } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 3fc61581eca720f64d4b19fd70b9b619cea9fcef..fc01513a866e414d401a2c244c7523599a5451ea 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -154,6 +154,7 @@ class OperatorBase { std::string DebugString() const { return DebugStringEx(nullptr); } virtual bool SupportGPU() const { return false; } + virtual bool SupportNPU() const { return false; } const std::string& Type() const { return type_; } @@ -490,6 +491,13 @@ class OperatorWithKernel : public OperatorBase { return platform::is_gpu_place(kern_pair.first.place_); }); } + bool SupportNPU() const override { + auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); + return std::any_of(op_kernels.begin(), op_kernels.end(), + [](OpKernelMap::const_reference kern_pair) { + return platform::is_npu_place(kern_pair.first.place_); + }); + } bool SupportsMKLDNN(proto::VarType::Type data_type) const; bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 73a699b41c8e010a72904e1c3bf8b405c8967754..eb021609e825839825b657ef516a18c5b4cbcc74 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -1407,10 +1407,23 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, member_->places_, graph)); } else { - VLOG(3) << "use FastThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph)); + if (member_->use_device_ == p::kXPU) { +#if defined(PADDLE_WITH_XPU) + VLOG(3) << "use BindThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::BindThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, + member_->local_exec_scopes_, member_->places_, graph)); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use XPU device since it's not compiled with XPU," + "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else { + VLOG(3) << "use FastThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, + member_->local_exec_scopes_, member_->places_, graph)); + } } final_graphs.emplace_back(graph); } diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index cdd2dbd5b1d2de1dcdeccf6e71fce6641680a4e9..42577972e9b79d2dcfdf692afdec19b3ab576c90 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -13,7 +13,7 @@ // limitations under the License. 
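For reference, the conservative PrepareData path introduced above is opt-in per op. A sketch of how a caller could request it by setting the attribute that PrepareData checks (the helper below is illustrative; where inference actually sets this flag is outside the lines shown here):

#include "paddle/fluid/framework/op_desc.h"

// Illustrative only: mark an op so OperatorWithKernel::PrepareData never
// skips data preparation for it (attribute name taken from the check above).
void ForcePrepareData(paddle::framework::OpDesc* op_desc) {
  op_desc->SetAttr("inference_force_prepare_data", true);
}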
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" @@ -35,9 +35,9 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ParseDumpConfig(trainer_desc); const auto& section_config = section_params.section_config(); int place_id = section_config.place_id(); -#if (defined PADDLE_WITH_NCCL) +#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL) place_ = platform::CUDAPlace(place_id); -#elif (defined WITH_ASCEND_CL) // NOLINT +#elif (defined PADDLE_WITH_ASCEND_CL) // NOLINT place_ = platform::NPUPlace(place_id); #endif worker_ = DeviceWorkerFactory::CreateDeviceWorker( @@ -113,19 +113,28 @@ void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program, this_worker->SetRootScope(root_scope_); this_worker->SetMinibatchScope(minibatch_scope_); this_worker->SetMicrobatchScopes(microbatch_scopes_); + this_worker->PrepareUnusedVar(); } void PipelineTrainer::Run() { VLOG(5) << "Going to run PipelineTrainer::Run()"; - section_thread_ = std::async(&DeviceWorker::TrainFiles, worker_.get()); -} - -void PipelineTrainer::Finalize() { try { - section_thread_.get(); + worker_->TrainFiles(); } catch (platform::EOFException& e) { std::rethrow_exception(std::current_exception()); } + for (auto* micro_scop : microbatch_scopes_) { + // By default, we should delete all kid scopes after run executor because + // some operators may create local scope when running, such as while_op. + // But when while_op also create a local executor to run it's sub block, + // the sub scopes it created should not be dropped immediately, because + // while_grad_op will use some variables created during while_op run, so + // we need to keep the kids and wait for the outer executor to drop them. + micro_scop->DropKids(); + } +} + +void PipelineTrainer::Finalize() { if (need_dump_field_) { FinalizeDumpEnv(); } diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index d178c4e556ca5773b864ff79fc7fb2d7fe6f8482..66d8a40dda160752e64eae8775a2045509e575e3 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" -#include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/string_helper.h" @@ -129,8 +128,6 @@ void PSGPUWorker::Initialize(const TrainerDesc& desc) { } } } - // pull_queue_ = paddle::framework::MakeChannel>(); - // push_queue_ = paddle::framework::MakeChannel>(); } void PSGPUWorker::SetChannelWriter(ChannelObject* queue) { diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 7860b69313e7b2270722abdabe5e922e2fabeac8..a7e84b34b2436bf60d1af19f4f128597250d5033 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -10,7 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) #include #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" @@ -96,12 +96,16 @@ void SectionWorker::RunUpdate( } } +void SectionWorker::PrepareUnusedVar() { + VLOG(5) << "begin prepare the unsed vars"; + unused_vars_ = GetUnusedVars(program_->Block(0), ops_, skip_vars_); +} + void SectionWorker::TrainFiles() { VLOG(5) << "begin section_worker TrainFiles"; int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; - auto unused_vars_ = GetUnusedVars(program_->Block(0), ops_, skip_vars_); if (max_memory_size >= 0) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place_)) { @@ -110,8 +114,22 @@ void SectionWorker::TrainFiles() { BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size)); } } +#elif defined(PADDLE_WITH_ASCEND_CL) + if (IsFastEagerDeletionModeEnabled()) { + VLOG(4) << "Use unsafe fast gc for NPU."; + gc.reset(new NPUUnsafeFastGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Please set FLAGS_fast_eager_deletion_mode=true to use " + "GarbageCollector on NPU.")); + // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. + VLOG(4) << "Use default stream gc for NPU."; + gc.reset(new NPUDefaultStreamGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } #endif - } + } // max_memory_size >= 0 if (schedule_mode_ == 0) { // F-then-B scheduler which runs Forward phase for all microbatches, diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 7e48d0dc5f96203c4bc89f954b82dfa582eddbc9..c67653953f8a76f8b848bc13efda6fcb23f965da 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -121,7 +121,7 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { SerializeToStream(os, selected_rows, *dev_ctx); } -void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows) { +void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; dev_ctx = pool.Get(platform::CPUPlace()); diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index e53e3d973c524657a7b579d96d0f51a39ba40f12..3e4beb9498cf777f91899cd09c8dbb27835a20c2 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -175,7 +175,7 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); -void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows); +void DeserializeFromStream(std::istream& os, SelectedRows* selected_rows); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index b304a45be3cdcc5defaca9e87d0aa291d09faceb..4f6eb803d1c26e8c0769ad8bbe0ee02133df7cbe 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -135,6 +135,49 @@ Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const { } } +std::vector Tensor::Split(int64_t split_size, int64_t axis) const { + check_memory_size(); + 
PADDLE_ENFORCE_GE(dims_.size(), 0, + platform::errors::OutOfRange( + "split expects at least a 1-dimensional tensor")); + PADDLE_ENFORCE_GE( + split_size, 0, + platform::errors::OutOfRange( + "split expects split_size be non-negative, but got split_size is %d", + split_size)); + int64_t numel_size = dims_[axis]; + + int64_t num_splits = 1; + if (split_size != 0) { + num_splits = + std::max((numel_size + split_size - 1) / split_size, 1); + } + + std::vector splits(num_splits); + int64_t last_split_size = split_size - (split_size * num_splits - numel_size); + + for (int64_t i = 0; i < num_splits; ++i) { + int64_t length = i < num_splits - 1 ? split_size : last_split_size; + splits[i] = Slice(i * split_size, i * split_size + length); + } + return splits; +} + +std::vector Tensor::Chunk(int64_t chunks, int64_t axis) const { + check_memory_size(); + PADDLE_ENFORCE_GE(dims_.size(), 0, + platform::errors::OutOfRange( + "split expects at least a 1-dimensional tensor")); + PADDLE_ENFORCE_GE( + chunks, 0, + platform::errors::OutOfRange( + "chunks expects to be greater than 0, but got chunks is %d", chunks)); + + int64_t numel_size = dims_[axis]; + int64_t split_size = (numel_size + chunks - 1) / chunks; + return Split(split_size, axis); +} + Tensor& Tensor::Resize(const DDim& dims) { dims_ = dims; return *this; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 0747321bcfa492e01c324954f168ff66426d1347..539859c45c9076c1787977ad4b0223c648efbd11 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -187,6 +187,22 @@ class Tensor { */ Tensor Slice(int64_t begin_idx, int64_t end_idx) const; + /** + * @brief Return a tensor list of the given tensor. + * + * @param[in] split_size The size of tensor to be split along axis. + * @param[in] axis The axis along which to split. + */ + std::vector Split(int64_t split_size, int64_t axis) const; + + /** + * @brief Return a tensor list of the given tensor. + * + * @param[in] chunks The number of tensor to be split along axis. + * @param[in] axis The axis along which to split. 
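The Split()/Chunk() bodies above reduce to ceiling-division bookkeeping over dims_[axis]. A standalone sketch of the same index arithmetic on plain integers (SplitRanges/ChunkRanges are illustrative names; the real methods return Tensor views via Slice()):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Returns (begin, end) index pairs along the axis, mirroring the
// num_splits / last_split_size arithmetic used by Tensor::Split.
std::vector<std::pair<int64_t, int64_t>> SplitRanges(int64_t axis_size,
                                                     int64_t split_size) {
  int64_t num_splits = 1;
  if (split_size != 0) {
    num_splits = std::max<int64_t>((axis_size + split_size - 1) / split_size, 1);
  }
  int64_t last = split_size - (split_size * num_splits - axis_size);
  std::vector<std::pair<int64_t, int64_t>> ranges;
  for (int64_t i = 0; i < num_splits; ++i) {
    int64_t length = (i < num_splits - 1) ? split_size : last;
    ranges.emplace_back(i * split_size, i * split_size + length);
  }
  return ranges;
}

// Chunk asks for a number of pieces and derives the per-piece size from it.
std::vector<std::pair<int64_t, int64_t>> ChunkRanges(int64_t axis_size,
                                                     int64_t chunks) {
  int64_t split_size = (axis_size + chunks - 1) / chunks;  // ceiling division
  return SplitRanges(axis_size, split_size);
}

int main() {
  for (auto r : ChunkRanges(/*axis_size=*/6, /*chunks=*/3)) {
    std::cout << "[" << r.first << ", " << r.second << ") ";
  }
  std::cout << "\n";  // prints [0, 2) [2, 4) [4, 6)
  return 0;
}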
+ */ + std::vector Chunk(int64_t chunks, int64_t axis) const; + const platform::Place& place() const { PADDLE_ENFORCE_NOT_NULL( holder_, diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 101463756c0a5143536362c706ae08333673c831..71ff50c92ca59f4ac11bf900ad06d4053f6decaf 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -337,3 +337,129 @@ TEST(Tensor, FP16) { // Tensor holds the wrong type, it holds N6paddle8platform7float16E at // [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43] } + +TEST(Tensor, Split) { + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 2}), + platform::CPUPlace()); + std::vector split_tensor_list = src_tensor.Split(2, 0); + ASSERT_EQ(split_tensor_list.size(), 3UL); + EXPECT_EQ(split_tensor_list[0].dims()[0], 2); + EXPECT_EQ(split_tensor_list[1].dims()[0], 2); + EXPECT_EQ(split_tensor_list[2].dims()[0], 2); + EXPECT_EQ(split_tensor_list[0].dims()[1], 2); + EXPECT_EQ(split_tensor_list[1].dims()[1], 2); + EXPECT_EQ(split_tensor_list[2].dims()[1], 2); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = reinterpret_cast( + src_tensor.mutable_data(src_tensor.dims(), platform::CPUPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + for (int i = 0; i < 3; ++i) { + uintptr_t split_data_address = + reinterpret_cast(split_tensor_list[i].data()); + uintptr_t split_mutable_data_address = + reinterpret_cast(split_tensor_list[i].mutable_data( + split_tensor_list[i].dims(), platform::CPUPlace())); + EXPECT_EQ(split_data_address, split_mutable_data_address); + EXPECT_EQ(src_data_address + 2 * 2 * i * sizeof(int), split_data_address); + } + } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 4}), + platform::CUDAPlace(0)); + std::vector split_tensor_list = src_tensor.Split(2, 0); + ASSERT_EQ(split_tensor_list.size(), 3UL); + EXPECT_EQ(split_tensor_list[0].dims()[0], 2); + EXPECT_EQ(split_tensor_list[1].dims()[0], 2); + EXPECT_EQ(split_tensor_list[2].dims()[0], 2); + EXPECT_EQ(split_tensor_list[0].dims()[1], 4); + EXPECT_EQ(split_tensor_list[1].dims()[1], 4); + EXPECT_EQ(split_tensor_list[2].dims()[1], 4); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace(0))); + EXPECT_EQ(src_data_address, src_mutable_data_address); + for (int i = 0; i < 3; ++i) { + uintptr_t split_data_address = + reinterpret_cast(split_tensor_list[i].data()); + uintptr_t split_mutable_data_address = + reinterpret_cast(split_tensor_list[i].mutable_data( + split_tensor_list[i].dims(), platform::CUDAPlace(0))); + EXPECT_EQ(split_data_address, split_mutable_data_address); + EXPECT_EQ(src_data_address + 2 * 4 * i * sizeof(double), + split_data_address); + } + } +#endif +} + +TEST(Tensor, Chunk) { + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 2}), + platform::CPUPlace()); + std::vector split_tensor_list = src_tensor.Chunk(3, 0); + ASSERT_EQ(split_tensor_list.size(), 3UL); + EXPECT_EQ(split_tensor_list[0].dims()[0], 2); + EXPECT_EQ(split_tensor_list[1].dims()[0], 2); + EXPECT_EQ(split_tensor_list[2].dims()[0], 2); + EXPECT_EQ(split_tensor_list[0].dims()[1], 2); + EXPECT_EQ(split_tensor_list[1].dims()[1], 2); + 
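The address EXPECT_EQs in the test above rely on Slice() returning views into the parent allocation, so the i-th split starts exactly rows_per_split * cols elements past the start of the shared storage. A plain-pointer sketch of that offset arithmetic (no real Tensor involved):

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // A 6x2 row-major int buffer viewed as 3 splits of 2 rows each; each "split"
  // is just a pointer into the shared storage, which is what the address
  // expectations in the test assume.
  std::vector<int> storage(6 * 2, 0);
  const std::int64_t rows_per_split = 2, cols = 2;
  for (std::int64_t i = 0; i < 3; ++i) {
    int* split_data = storage.data() + i * rows_per_split * cols;
    auto expected = reinterpret_cast<std::uintptr_t>(storage.data()) +
                    static_cast<std::uintptr_t>(i * rows_per_split * cols) *
                        sizeof(int);
    assert(reinterpret_cast<std::uintptr_t>(split_data) == expected);
  }
  return 0;
}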
EXPECT_EQ(split_tensor_list[2].dims()[1], 2); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = reinterpret_cast( + src_tensor.mutable_data(src_tensor.dims(), platform::CPUPlace())); + for (int i = 0; i < 3; ++i) { + uintptr_t split_data_address = + reinterpret_cast(split_tensor_list[i].data()); + uintptr_t split_mutable_data_address = + reinterpret_cast(split_tensor_list[i].mutable_data( + split_tensor_list[i].dims(), platform::CPUPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(split_data_address, split_mutable_data_address); + EXPECT_EQ(src_data_address + 2 * 2 * i * sizeof(int), split_data_address); + } + } +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 4}), + platform::CUDAPlace(0)); + std::vector split_tensor_list = src_tensor.Chunk(3, 0); + ASSERT_EQ(split_tensor_list.size(), 3UL); + EXPECT_EQ(split_tensor_list[0].dims()[0], 2); + EXPECT_EQ(split_tensor_list[1].dims()[0], 2); + EXPECT_EQ(split_tensor_list[2].dims()[0], 2); + EXPECT_EQ(split_tensor_list[0].dims()[1], 4); + EXPECT_EQ(split_tensor_list[1].dims()[1], 4); + EXPECT_EQ(split_tensor_list[2].dims()[1], 4); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace(0))); + EXPECT_EQ(src_data_address, src_mutable_data_address); + for (int i = 0; i < 3; ++i) { + uintptr_t split_data_address = + reinterpret_cast(split_tensor_list[i].data()); + uintptr_t split_mutable_data_address = + reinterpret_cast(split_tensor_list[i].mutable_data( + split_tensor_list[i].dims(), platform::CUDAPlace(0))); + EXPECT_EQ(split_data_address, split_mutable_data_address); + EXPECT_EQ(src_data_address + 2 * 4 * i * sizeof(double), + split_data_address); + } + } +#endif +} diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 78fd1af09e29458ec84549c55dd99f8c29da29db..d2616da7a127da8c5e7b204c5216d31ad8933d97 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -22,9 +22,11 @@ limitations under the License. 
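The TensorCopy hunk above switches to the destination's DeviceContext whenever the destination is a GPU or NPU place, and keeps using the source's context otherwise. A minimal sketch of just that selection rule, with an enum standing in for platform::Place:

#include <iostream>

enum class Place { kCPU, kGPU, kNPU };

bool IsAcceleratorPlace(Place p) { return p == Place::kGPU || p == Place::kNPU; }

// The copy is driven by the destination's device context when the destination
// is an accelerator place, otherwise by the source's context.
Place ContextPlaceForCopy(Place src, Place dst) {
  return IsAcceleratorPlace(dst) ? dst : src;
}

int main() {
  bool uses_npu_ctx =
      ContextPlaceForCopy(Place::kCPU, Place::kNPU) == Place::kNPU;
  std::cout << uses_npu_ctx << "\n";  // 1: a host-to-NPU copy uses the NPU context
  return 0;
}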
*/ #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_MKLDNN +#include "dnnl_debug.h" +#endif namespace paddle { namespace framework { @@ -61,6 +63,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } + VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr; #ifdef PADDLE_WITH_MKLDNN auto size = src.layout() == DataLayout::kMKLDNN @@ -278,7 +281,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; - if (platform::is_gpu_place(dst_place)) { + if (platform::is_gpu_place(dst_place) || platform::is_npu_place(dst_place)) { dev_ctx = pool.Get(dst_place); } else { dev_ctx = pool.Get(src.place()); @@ -503,6 +506,11 @@ class AnyVisitor : public boost::static_visitor { // return GetResultHelper(out, npu); } + bool GetResult(const framework::Tensor& out, + const platform::NPUPinnedPlace& cpu) const { + return *out.data(); + } + bool GetResult(const framework::Tensor& out, const platform::CPUPlace& cpu) const { return *out.data(); @@ -731,6 +739,18 @@ struct BothFalseVisitor : public boost::static_visitor<> { out_ptr[i] = lhs && rhs; } } + + void VisitorImpl( + const platform::NPUPinnedPlace& cpu /* equals to cpu*/) const { + int num = in_.numel(); + const bool* in_ptr = in_.data(); + bool* out_ptr = out_->data(); + for (int i = 0; i < num; ++i) { + bool lhs = !in_ptr[i]; + bool rhs = !out_ptr[i]; + out_ptr[i] = lhs && rhs; + } + } }; void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) { @@ -1120,9 +1140,9 @@ std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) { } template <> -std::ostream& print_tensor( +std::ostream& print_tensor>( std::ostream& os, const framework::Tensor& tensor) { - auto inspect = tensor.data(); + auto inspect = tensor.data>(); auto element_num = tensor.numel(); os << " - data: ["; @@ -1138,9 +1158,9 @@ std::ostream& print_tensor( } template <> -std::ostream& print_tensor( +std::ostream& print_tensor>( std::ostream& os, const framework::Tensor& tensor) { - auto inspect = tensor.data(); + auto inspect = tensor.data>(); auto element_num = tensor.numel(); os << " - data: ["; @@ -1160,6 +1180,11 @@ std::ostream& operator<<(std::ostream& os, const Tensor& t) { os << " - shape: [" << t.dims() << "]\n"; os << " - layout: " << DataLayoutToString(t.layout()) << "\n"; +#ifdef PADDLE_WITH_MKLDNN + os << " - format: " + << dnnl_fmt_tag2str(static_cast(t.format())) << "\n"; +#endif + Tensor tensor; tensor.Resize(t.dims()); if (platform::is_cpu_place(t.place())) { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 22c8e1c1665f121cda6ba33f23cb7fc0749da025..15c478e531e9c756bdb4296bbc64e65aab331828 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -19,6 +19,10 @@ limitations under the License. 
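The NPUPinnedPlace overload added to BothFalseVisitor repeats the element-wise rule used on the other places: an output element ends up true only when both the input flag and the previous output flag were false. A small sketch of that combine step on plain arrays; reading the two flag arrays as "has Inf" / "has NaN" is an assumption based on the surrounding TensorIsfinite code:

#include <cassert>
#include <cstddef>
#include <vector>

// out[i] = !in[i] && !out[i]: the element survives as "true" only when it is
// flagged by neither `in` nor `out`.
void CombineBothFalse(const std::vector<bool>& in, std::vector<bool>* out) {
  for (std::size_t i = 0; i < in.size(); ++i) {
    (*out)[i] = !in[i] && !(*out)[i];
  }
}

int main() {
  std::vector<bool> has_inf = {false, true, false};
  std::vector<bool> result = {false, false, true};  // interpreted as "has NaN"
  CombineBothFalse(has_inf, &result);
  assert(result[0] && !result[1] && !result[2]);  // only element 0 is finite
  return 0;
}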
*/ #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -166,8 +170,30 @@ void TensorFromVector(const std::vector& src, // Since vector is on cpu, I think this function should be a "sync" operation, // so pass nullptr as stream to memory::Copy(). else if (platform::is_npu_place(dst_place)) { // NOLINT - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - src_place, src_ptr, size, nullptr); + // 1. vector -> npu pinned tensor + Tensor npu_pinned_tensor(dst->type()); + platform::NPUPinnedPlace npu_pinned_place; + auto npu_pinned_ptr = + npu_pinned_tensor.mutable_data(dst->dims(), npu_pinned_place); + memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); + + // 2. async copy npu pinned tensor -> npu tensor + memory::Copy( + BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + npu_pinned_place, npu_pinned_ptr, size, + reinterpret_cast(ctx).stream()); + + // 3. record event + auto npu_pinned_allocator = + static_cast( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(npu_pinned_place) + .get()); + paddle::memory::allocation::Allocation* allocation = + npu_pinned_tensor.Holder().get(); + npu_pinned_allocator->RecordEvent( + allocation, + reinterpret_cast(ctx).stream()); } #endif } @@ -206,8 +232,31 @@ inline void TensorFromVector(const std::vector& src, #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(dst_place)) { // NOLINT - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - src_place, src_ptr, size, nullptr); + // 1. vector -> npu pinned tensor + platform::NPUPinnedPlace npu_pinned_place; + Tensor npu_pinned_tensor; + npu_pinned_tensor.Resize(dst->dims()); + auto npu_pinned_ptr = + npu_pinned_tensor.mutable_data(npu_pinned_place, dst->type()); + memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); + + // 2. async copy npu pinned tensor -> npu tensor + memory::Copy( + BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + npu_pinned_place, npu_pinned_ptr, size, + reinterpret_cast(ctx).stream()); + + // 3. record event + auto npu_pinned_allocator = + static_cast( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(npu_pinned_place) + .get()); + paddle::memory::allocation::Allocation* allocation = + npu_pinned_tensor.Holder().get(); + npu_pinned_allocator->RecordEvent( + allocation, + reinterpret_cast(ctx).stream()); } #endif delete[] array; diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 01aa07e618464db05aa5c4bf322ec78aac110e1b..fc8fb9327d5bb2d2a3627f7fd463d48efb9a514f 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -27,8 +27,8 @@ limitations under the License. 
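The TensorFromVector NPU path above is a three-step staging copy: copy the host vector into pinned host memory, issue an asynchronous pinned-to-device copy on a stream, then record an event on the pinned allocation so the allocator does not recycle it before the copy finishes. A sketch of the same pattern using the CUDA runtime purely as a stand-in for the NPU runtime (the real code goes through memory::Copy, NPUPinnedPlace and RecordEvent; error handling is trimmed):

#include <cuda_runtime.h>
#include <cstring>
#include <vector>

int main() {
  std::vector<float> src(1024, 1.0f);
  size_t bytes = src.size() * sizeof(float);

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  // 1. host vector -> pinned host memory (synchronous memcpy).
  void* pinned = nullptr;
  cudaHostAlloc(&pinned, bytes, cudaHostAllocDefault);
  std::memcpy(pinned, src.data(), bytes);

  // 2. async copy pinned host memory -> device memory on the stream.
  void* device = nullptr;
  cudaMalloc(&device, bytes);
  cudaMemcpyAsync(device, pinned, bytes, cudaMemcpyHostToDevice, stream);

  // 3. record an event; the pinned buffer must stay alive until it fires.
  cudaEvent_t copy_done;
  cudaEventCreate(&copy_done);
  cudaEventRecord(copy_done, stream);

  cudaEventSynchronize(copy_done);  // here we simply wait before freeing
  cudaFreeHost(pinned);
  cudaFree(device);
  cudaEventDestroy(copy_done);
  cudaStreamDestroy(stream);
  return 0;
}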
*/ #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/fleet/heter_context.h" -#include "paddle/fluid/framework/fleet/heter_wrapper.h" -#include "paddle/fluid/framework/heter_service.h" +//#include "paddle/fluid/framework/fleet/heter_wrapper.h" +#include "paddle/fluid/framework/heter_util.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/reader.h" @@ -47,6 +47,10 @@ class PullDenseWorker; class Scope; class VarDesc; class DeviceWorker; +class HeterWrapper; +class HeterRequest; +class HeterResponse; + template class ChannelObject; @@ -239,55 +243,6 @@ class HeterXpuTrainer : public TrainerBase { #endif }; -class HeterBoxTrainer : public TrainerBase { - public: - HeterBoxTrainer() {} - virtual ~HeterBoxTrainer() {} - virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set); - virtual void InitTrainerEnv(const ProgramDesc& main_program, - const platform::Place& place); - virtual void InitOtherEnv(const ProgramDesc& main_program); - virtual void Run(); - virtual void Finalize(); - virtual void RegisterHeterCallback(); - virtual void DumpWork(int tid); - virtual Scope* GetWorkerScope(int thread_id); - virtual void CacheProgram(const ProgramDesc& main_program) { - new (&program_) ProgramDesc(main_program); - } - virtual std::string GetDumpPath(int tid) { return ""; } - virtual void InitDumpEnv() {} - template -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor, - const paddle::platform::Place& thread_place, - gpuStream_t stream); -#endif - void CreateThreadParam(const ProgramDesc& program, int num); - template - void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); - - protected: - DownpourWorkerParameter param_; - std::map> dense_grad_names_; - std::vector need_merge_var_names_; - float scale_datanorm_; - paddle::platform::Place place_; - ProgramDesc program_; - std::shared_ptr fleet_ptr_; - std::shared_ptr pull_dense_worker_; - std::vector> workers_; - std::vector places_; - // ps-gpu - std::vector pull_threads_; - std::vector threads_; - int use_ps_gpu_; - int thread_num_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - std::vector copy_streams_; - std::vector events_; -#endif -}; #endif #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ @@ -333,7 +288,7 @@ class PSGPUTrainer : public TrainerBase { #endif #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(WITH_ASCEND_CL) + defined(PADDLE_WITH_ASCEND_CL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index 6b9dbece8974c286a390627f25e4a25ee8bfb8d3..660511b1f268d910629199bd122561a2a24a1b0a 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -70,13 +70,13 @@ REGISTER_TRAINER_CLASS(DistMultiTrainer); defined PADDLE_WITH_XPU) && \ (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(HeterXpuTrainer); -REGISTER_TRAINER_CLASS(HeterBoxTrainer); #endif #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(PSGPUTrainer); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) 
REGISTER_TRAINER_CLASS(PipelineTrainer); #endif } // namespace framework diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index e43cccfe648165ce962b779cb513effe990d0ab3..951daea47bde3b9f251c442c07368c17d24b81b5 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -71,6 +71,7 @@ using DygraphGradOpMakerFN = const imperative::NameVarBaseMap& /*var_base_map_in*/, const imperative::NameVarBaseMap& /*var_base_map_out*/, const framework::AttributeMap& /*attributes*/, + const framework::AttributeMap& /*default attributes*/, const std::map& /*inplace_map*/)>; using InferVarTypeFN = diff --git a/paddle/fluid/framework/unused_var_check.cc b/paddle/fluid/framework/unused_var_check.cc index 0f8465ab8948e425ec48d10052643699e3c10ce7..f8ace3e85a643e8166da2b2e6f35a8097761b8cd 100644 --- a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -75,6 +75,7 @@ static const std::unordered_set &GetOpWithUnusedVarAllowSet() { "data_norm_grad", // 0 "update_loss_scaling", // 0 "fused_embedding_eltwise_layernorm", // 0 + "trunc_grad", // 1 }); return *allow_set; } diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 6bee3d44b2edd71da6f8554e998f244376d40442..c9dffe2d76a436e9888b91caf10e311e5c771572 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags) -cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils) cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index fd2bb6e5c995222cdabedefab93cd696c7c3d9e1..eba30ff8edebf9b4fd0b101c45a13c0a9086e42b 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -33,7 +33,8 @@ AmpOperators::AmpOperators() for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) { bool supported = false; for (auto& kernel_type : it->second) { - if (platform::is_gpu_place(kernel_type.first.place_) && + if ((platform::is_gpu_place(kernel_type.first.place_) || + platform::is_xpu_place(kernel_type.first.place_)) && kernel_type.first.data_type_ == fp16_dtype) { supported = true; } @@ -91,7 +92,8 @@ inline std::string GetDtypeStr( inline bool NeedCast(const std::shared_ptr& var) { if (platform::is_gpu_place(var->Place()) || - platform::is_cuda_pinned_place(var->Place())) { + platform::is_cuda_pinned_place(var->Place()) || + platform::is_xpu_place(var->Place())) { // CudaPinndePlace is added for varbase created by dataloader if (var->DataType() == framework::proto::VarType::FP32 || var->DataType() == framework::proto::VarType::FP16) { @@ -141,7 +143,7 @@ static inline std::shared_ptr CastToFP32( } static inline framework::proto::VarType::Type GetPromoteType( - const NameVarBaseMap& ins) { + const std::string& op_type, const NameVarBaseMap& ins) { auto dst_type = framework::proto::VarType::FP16; for (const auto& pair : ins) { for (const auto& var : pair.second) { 
@@ -151,6 +153,18 @@ static inline framework::proto::VarType::Type GetPromoteType( } } } + + // NOTE(juncai): moving_average_abs_max_scale only consider the + // dtype of input(X) + if (op_type == "moving_average_abs_max_scale") { + for (const auto& pair : ins) { + if (pair.first == "X" && + pair.second.front()->DataType() == framework::proto::VarType::FP16) { + dst_type = framework::proto::VarType::FP16; + } + } + } + return dst_type; } @@ -160,7 +174,8 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, if (AmpOperators::Instance().GetMutableAllowOps()->count(op_type)) { for (auto& pair : new_ins) { // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. - if ((op_type == "batch_norm" || op_type == "layer_norm") && + if ((op_type == "batch_norm" || op_type == "layer_norm" || + op_type == "sync_batch_norm") && pair.first != "X") { continue; } @@ -182,7 +197,8 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, } return new_ins; } else { - auto dst_type = GetPromoteType(ins); + auto dst_type = GetPromoteType(op_type, ins); + // NOTE(zhiqiu): if the op has op fp16 kernel, fall back to fp32. if (dst_type == framework::proto::VarType::FP16 && AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count( @@ -191,7 +207,8 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, } for (auto& pair : new_ins) { // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. - if ((op_type == "batch_norm" || op_type == "layer_norm") && + if ((op_type == "batch_norm" || op_type == "layer_norm" || + op_type == "sync_batch_norm") && pair.first == "X" && dst_type == framework::proto::VarType::FP32) { continue; } diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index d5350744e4c55384e14e4ff5f06bc90abed87ce2..84ee1fbe5df96abc0c47b66a34a6e84e1f9be2b6 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -408,7 +408,8 @@ void BasicEngine::Execute() { VLOG(10) << "create temporary var of " << var->Name() << " for sum gradient within this graph!"; } else if (!inplace_grad_name_map.empty() && - inplace_grad_name_map.count(pair.first)) { + inplace_grad_name_map.count(pair.first) && + bwd_ins.count(inplace_grad_name_map.at(pair.first))) { // When calculate Inplace grad op, create a new output var. // If a tmp var has been created, there is no need to create it // again. 
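GetPromoteType() starts from FP16 and, in the loop whose body is elided in the hunk above, is assumed to promote the result to FP32 as soon as any input variable is FP32 (the usual AMP rule); the moving_average_abs_max_scale exception is handled separately. A standalone sketch of that promotion over a plain name-to-dtype map (DType and the map layout are simplifications, not the real proto types):

#include <iostream>
#include <map>
#include <string>
#include <vector>

enum class DType { FP16, FP32 };

// Promote to FP32 if any input is FP32; otherwise stay in FP16.
DType GetPromoteType(const std::map<std::string, std::vector<DType>>& ins) {
  DType dst = DType::FP16;
  for (const auto& pair : ins) {
    for (DType dtype : pair.second) {
      if (dtype == DType::FP32) {
        dst = DType::FP32;
      }
    }
  }
  return dst;
}

int main() {
  std::map<std::string, std::vector<DType>> ins = {
      {"X", {DType::FP16}}, {"Y", {DType::FP32}}};
  std::cout << (GetPromoteType(ins) == DType::FP32 ? "FP32" : "FP16") << "\n";
  return 0;
}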
@@ -470,12 +471,21 @@ void BasicEngine::Execute() { { VLOG(3) << "Start to execute grad op " << cur_op.Type(); - if (tmp_ins_ptr == nullptr) { - OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), - cur_op.place()); - } else { - OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, cur_op.Attrs(), - cur_op.place()); + try { + if (tmp_ins_ptr == nullptr) { + OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), + cur_op.DefaultAttrsMap(), cur_op.place()); + } else { + OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, + cur_op.Attrs(), cur_op.DefaultAttrsMap(), + cur_op.place()); + } + } catch (platform::EnforceNotMet& exception) { + Clear(); + throw std::move(exception); + } catch (std::exception& ex) { + Clear(); + PADDLE_THROW(platform::errors::External("%s", ex.what())); } } diff --git a/paddle/fluid/imperative/dygraph_grad_maker.h b/paddle/fluid/imperative/dygraph_grad_maker.h index 7fefc9ccc67b52aab5073d3dd6c738ab07075e78..f1eb8aa62c9271b194d5159883392372f4cbd4f3 100644 --- a/paddle/fluid/imperative/dygraph_grad_maker.h +++ b/paddle/fluid/imperative/dygraph_grad_maker.h @@ -113,9 +113,18 @@ class GradOpBaseMakerBase { return vec_temp; } + // Only for dygraph + void SetDygraphDefaultAttrsMap(const framework::AttributeMap& default_attrs) { + default_attrs_ = &default_attrs; + } + + const framework::AttributeMap& DefaultAttrsMap() const { + return *default_attrs_; + } + const framework::AttributeMap& Attrs() const { return attrs_; } - const framework::Attribute& GetAttr(const std::string& name) const { + virtual const framework::Attribute& GetAttr(const std::string& name) const { auto it = attrs_.find(name); PADDLE_ENFORCE_EQ( it != attrs_.end(), true, @@ -199,6 +208,7 @@ class GradOpBaseMakerBase { const NameVarBaseMap& var_base_map_in_; const NameVarBaseMap& var_base_map_out_; const framework::AttributeMap& attrs_; + const framework::AttributeMap* default_attrs_; const std::map& inplace_map_; }; @@ -285,6 +295,10 @@ class TracedGradOp { return op_->SetAttrMap(attrs); } + void SetDefaultAttrsMap(const framework::AttributeMap& attrs) { + return op_->SetDefaultAttrsMap(attrs); + } + void SetAttr(const std::string& name, const framework::Attribute& v) { op_->SetAttr(name, v); } diff --git a/paddle/fluid/imperative/execution_context.h b/paddle/fluid/imperative/execution_context.h index 398b1292e2ffe05beef3fea50c7b676625cab5bd..5446add86788b23c2e002b86e463cc2a2379f04b 100644 --- a/paddle/fluid/imperative/execution_context.h +++ b/paddle/fluid/imperative/execution_context.h @@ -35,11 +35,13 @@ class DygraphExecutionContext : public framework::ExecutionContext { const framework::RuntimeContext& ctx, const NameVarMap& var_base_map_in, const NameVarMap& var_base_map_out, - const framework::AttributeMap& attrs) + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs) : ExecutionContext(op, scope, device_context, ctx), var_base_map_in_(var_base_map_in), var_base_map_out_(var_base_map_out), - attrs_(attrs) {} + attrs_(attrs), + default_attrs_(default_attrs) {} std::string InputName(const std::string& name) const override { auto it = var_base_map_in_.find(name); @@ -92,7 +94,7 @@ class DygraphExecutionContext : public framework::ExecutionContext { } bool HasAttr(const std::string& name) const override { - return attrs_.count(name) != 0; + return attrs_.count(name) != 0 || default_attrs_.count(name) != 0; } const framework::AttributeMap& Attrs() const override { return attrs_; } @@ -100,9 +102,14 @@ class DygraphExecutionContext : public 
framework::ExecutionContext { const framework::Attribute& GetAttr(const std::string& name) const override { auto it = attrs_.find(name); - PADDLE_ENFORCE_NE( - it, attrs_.end(), - platform::errors::NotFound("can not find [%s] in attrs", name)); + if (it == attrs_.end()) { + it = default_attrs_.find(name); + if (it == default_attrs_.end()) { + PADDLE_THROW(platform::errors::NotFound( + "Can not find [%s] in attributes of op %s.", name, + this->GetOp().Type())); + } + } return it->second; } @@ -192,6 +199,7 @@ class DygraphExecutionContext : public framework::ExecutionContext { const NameVarMap& var_base_map_in_; const NameVarMap& var_base_map_out_; const framework::AttributeMap& attrs_; + const framework::AttributeMap& default_attrs_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 43546cf99c69ffa3aa1f1a792e7b344ed0735a31..57657941ef83f3a3ea0e9e716d49a8b38d22eef8 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -24,8 +24,7 @@ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/profiler.h" @@ -132,6 +131,12 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif + void operator()(const platform::NPUPinnedPlace& place) { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } // there is NO blas in CUDAPinnedPlace void operator()(const platform::CUDAPinnedPlace& place) { PADDLE_THROW(platform::errors::PermissionDenied( @@ -194,8 +199,8 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { PADDLE_TENSOR_ADD(double); // NOTE(chenweihang): only support complex grad tensor accumulated, // support selected rows if needed in the future - PADDLE_TENSOR_ADD(platform::complex64); - PADDLE_TENSOR_ADD(platform::complex128); + PADDLE_TENSOR_ADD(platform::complex); + PADDLE_TENSOR_ADD(platform::complex); #endif #undef PADDLE_TENSOR_ADD diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index fcd4545a2c82d3c64f8d8d8683438aaf0e6a2719..7efe1177f5dc78d36dce0833fc8ec5fdfc0ed921 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -35,10 +35,12 @@ class DygraphInferShapeContext : public framework::InferShapeContext { DygraphInferShapeContext(const NameVarMap* in, const NameVarMap* out, const framework::AttributeMap* attr, + const framework::AttributeMap* default_attr, const std::string op_type) : var_base_map_in_(in), var_base_map_out_(out), attrs_(attr), + default_attrs_(default_attr), op_type_(op_type) {} bool HasInput(const std::string& name) const override { @@ -101,7 +103,7 @@ class DygraphInferShapeContext : public framework::InferShapeContext { } framework::AttrReader Attrs() const override { - return framework::AttrReader(*attrs_); + return framework::AttrReader(*attrs_, *default_attrs_); } std::vector Inputs(const std::string& name) const override { @@ -395,6 +397,7 @@ class DygraphInferShapeContext : public 
framework::InferShapeContext { const NameVarMap* var_base_map_in_; const NameVarMap* var_base_map_out_; const framework::AttributeMap* attrs_; + const framework::AttributeMap* default_attrs_; const std::string op_type_; }; diff --git a/paddle/fluid/imperative/infer_var_type_context.h b/paddle/fluid/imperative/infer_var_type_context.h index f740507fa508600fd268c8b80e5850497b07ea3d..7defc339f4f81dd9b3efe2104164b3cfabaa2a40 100644 --- a/paddle/fluid/imperative/infer_var_type_context.h +++ b/paddle/fluid/imperative/infer_var_type_context.h @@ -32,20 +32,28 @@ class RuntimeInferVarTypeContext : public framework::InferVarTypeContext { public: RuntimeInferVarTypeContext(const NameVarMap& inputs, const NameVarMap& outputs, - const framework::AttributeMap& attrs_map) + const framework::AttributeMap& attrs_map, + const framework::AttributeMap& default_attrs_map) : InferVarTypeContext(nullptr, nullptr), inputs_(inputs), outputs_(outputs), - attrs_(attrs_map) {} + attrs_(attrs_map), + default_attrs_(default_attrs_map) {} virtual ~RuntimeInferVarTypeContext() {} framework::Attribute GetAttr(const std::string& name) const override { - auto iter = attrs_.find(name); - PADDLE_ENFORCE_EQ( - iter != attrs_.end(), true, - platform::errors::NotFound("Cannot find attribute %s", name)); - return iter->second; + auto it = attrs_.find(name); + + if (it == attrs_.end()) { + it = default_attrs_.find(name); + if (it == default_attrs_.end()) { + PADDLE_THROW(platform::errors::NotFound( + "Can not find [%s] in attributes.", name)); + } + } + + return it->second; } bool HasInput(const std::string& name) const override { @@ -233,6 +241,7 @@ class RuntimeInferVarTypeContext : public framework::InferVarTypeContext { const NameVarMap& inputs_; const NameVarMap& outputs_; const framework::AttributeMap& attrs_; + const framework::AttributeMap& default_attrs_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index a4af3117d3e32ea8db37881bef9c4423ba0173ca..6e28ecd9971abcee51e4c3910896eadae7b01c0a 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -329,6 +329,7 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, const platform::Place& place) { auto* op_kernel = dynamic_cast(&op); PADDLE_ENFORCE_NOT_NULL( @@ -336,7 +337,8 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, "Only support operator with kernel in Dygraph mode.")); auto& info = op.Info(); if (info.infer_var_type_) { - RuntimeInferVarTypeContext infer_var_type_ctx(ins, outs, attrs); + RuntimeInferVarTypeContext infer_var_type_ctx(ins, outs, attrs, + default_attrs); info.infer_var_type_(&infer_var_type_ctx); } @@ -369,13 +371,14 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, * after the execution of op, but the original input is directly * overwritten in the previous dynamic graph implemention. 
*/ - auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs); + auto prepared_op = + PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs, default_attrs); auto tmp_ins_ptr = PrepareData(*op_kernel, ins, prepared_op.kernel_type()); if (tmp_ins_ptr == nullptr) { - prepared_op.Run(ins, outs, attrs); + prepared_op.Run(ins, outs, attrs, default_attrs); } else { - prepared_op.Run(*tmp_ins_ptr, outs, attrs); + prepared_op.Run(*tmp_ins_ptr, outs, attrs, default_attrs); } VLOG(4) << LayerDebugString(op.Type(), ins, outs); @@ -395,16 +398,18 @@ void OpBase::Run(const framework::OperatorBase& op, const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, const platform::Place& place) { - OpBaseRunImpl(op, ins, outs, attrs, place); + OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); } void OpBase::Run(const framework::OperatorBase& op, const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, const platform::Place& place) { - OpBaseRunImpl(op, ins, outs, attrs, place); + OpBaseRunImpl(op, ins, outs, attrs, default_attrs, place); } void ClearNoNeedBufferInputs(OpBase* op) { @@ -446,15 +451,15 @@ void ClearNoNeedBufferInputs(OpBase* op) { std::shared_ptr CreateGradOpNode( const framework::OperatorBase& op, const NameVarBaseMap& ins, const NameVarBaseMap& outs, const framework::AttributeMap& attrs, - const platform::Place& place, + const framework::AttributeMap& default_attrs, const platform::Place& place, const std::map& inplace_map) { const auto& info = op.Info(); if (!info.dygraph_grad_op_maker_) { return nullptr; } - auto grad_node = - info.dygraph_grad_op_maker_(op.Type(), ins, outs, attrs, inplace_map); + auto grad_node = info.dygraph_grad_op_maker_(op.Type(), ins, outs, attrs, + default_attrs, inplace_map); if (grad_node && !grad_node->empty()) { for (auto& grad_op : *grad_node) { grad_op.SetId(OpBase::GenerateUniqueId()); diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index bbede47e36429887b70c7a7310176c38f6d41a52..56e16ba199707c37031b55b65057cd95ff5ed805 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -108,7 +108,7 @@ class VarBase { void ClearGradVarBase() { grad_var_ = nullptr; } - void SetGradVarBase(VarBase& grad_var) { + void SetGradVarBase(const VarBase& grad_var) { MutableGradVarBase()->CopyFrom(grad_var, true); } @@ -283,7 +283,7 @@ class Layer { std::shared_ptr CreateGradOpNode( const framework::OperatorBase& op, const NameVarBaseMap& ins, const NameVarBaseMap& outs, const framework::AttributeMap& attrs, - const platform::Place& place, + const framework::AttributeMap& default_attrs, const platform::Place& place, const std::map& inplace_map); void ClearNoNeedBufferInputs(OpBase* op); diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 0164ff9313cdfe2344f98610602a6bd40a5e903a..acb125a82925d7971b7b03ee90198f87c1a5b9c0 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -50,6 +50,10 @@ class OpBase { const framework::AttributeMap& Attrs() const { return attrs_; } + const framework::AttributeMap& DefaultAttrsMap() const { + return *default_attrs_; + } + const framework::OpInfo& Info() const { PADDLE_ENFORCE_NOT_NULL(op_, platform::errors::PreconditionNotMet( "OpBase::Info() should be called after " @@ -99,6 +103,10 @@ class OpBase { void SetAttrMap(const 
framework::AttributeMap& attrs) { attrs_ = attrs; } + void SetDefaultAttrsMap(const framework::AttributeMap& default_attrs) { + default_attrs_ = &default_attrs; + } + void SetAttr(const std::string& name, const framework::Attribute& v) { attrs_[name] = v; } @@ -110,14 +118,23 @@ class OpBase { const framework::AttributeMap& Attrs() { return attrs_; } - bool HasAttr(const std::string& name) const { return attrs_.count(name) > 0; } + const framework::AttributeMap& DefaultAttrsMap() { return *default_attrs_; } + + bool HasAttr(const std::string& name) const { + return attrs_.count(name) > 0 || default_attrs_->count(name) > 0; + } const framework::Attribute& GetAttr(const std::string& name) const { auto it = attrs_.find(name); - PADDLE_ENFORCE_NE( - it, attrs_.end(), - platform::errors::NotFound("can not find attribute [%s]", name)); - return it->second; + if (it != attrs_.end()) { + return it->second; + } else { + auto it_default = default_attrs_->find(name); + PADDLE_ENFORCE_NE( + it_default, default_attrs_->end(), + platform::errors::NotFound("can not find attribute [%s]", name)); + return it_default->second; + } } template @@ -156,12 +173,14 @@ class OpBase { const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, const platform::Place& place); static void Run(const framework::OperatorBase& op, const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs, const platform::Place& place); private: @@ -174,6 +193,7 @@ class OpBase { NameVarMap ins_; NameVarMap outs_; framework::AttributeMap attrs_; + const framework::AttributeMap* default_attrs_; std::unique_ptr op_; platform::Place place_; size_t id_{-1UL}; diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 3da3a05ed1071cae20cf16ebfed6f6310937daae..84ba60fef80d5f82b4bc45ec71b537608824c8e6 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -73,6 +73,7 @@ static void GetGraphInfoBetweenTargets( std::unordered_map *op_deps_ptr, std::unordered_set *related_grad_vars_ptr, const std::unordered_set &no_grad_var_grad) { + VLOG(10) << "prune graph starts"; /** * Step 1. Find the candidate startup grad ops, prepared for following BFS. */ @@ -117,6 +118,8 @@ static void GetGraphInfoBetweenTargets( auto *op = op_node_pair.first; auto *node = op_node_pair.second; + VLOG(10) << "Visit node " << node << " , visit op " << op->Type(); + for (auto &output_pair : op->GetOutsMap()) { if (!output_pair.second.IsGrad()) { VLOG(10) << "WARNING: " << op->Type() << " outputs a forward var"; @@ -135,6 +138,7 @@ static void GetGraphInfoBetweenTargets( for (auto &pending_node : node->GradPendingNodes()) { if (visited.count(pending_node.get()) == 0) { + visited.insert(pending_node.get()); for (auto &pending_op : *pending_node) { preceding_ops[&pending_op].insert(op); q.emplace(&pending_op, pending_node.get()); @@ -143,6 +147,8 @@ static void GetGraphInfoBetweenTargets( } } + VLOG(10) << "Found endpoint op ends"; + /** * Step 3. Based on the found input_target_grads, BFS the graph in reverse * order. `target_vars` would record all grad vars in the graph, and @@ -246,6 +252,8 @@ static void GetGraphInfoBetweenTargets( } } + VLOG(10) << "Found startup op ends"; + /** * Step 4. 
Prune output_targets which is not the input of startup_ops */ @@ -884,11 +892,13 @@ void PartialGradTask::RunEachOp(OpBase *op) { } // Run op - OpBase::Run(op->InnerOp(), tmp_ins, tmp_outs, op->Attrs(), op->place()); + OpBase::Run(op->InnerOp(), tmp_ins, tmp_outs, op->Attrs(), + op->DefaultAttrsMap(), op->place()); if (create_graph_) { - auto double_grad_node = CreateGradOpNode(op->InnerOp(), tmp_ins, tmp_outs, - op->Attrs(), op->place(), {}); + auto double_grad_node = + CreateGradOpNode(op->InnerOp(), tmp_ins, tmp_outs, op->Attrs(), + op->DefaultAttrsMap(), op->place(), {}); PADDLE_ENFORCE_NOT_NULL( double_grad_node, platform::errors::NotFound("The Op %s doesn't have any grad op. If you " diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 2a3b6424d4a14e1cd6345cf24594582bd19f51d4..57c6ae3cbb0a136cdb87995096fc8c9b911ea855 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -15,8 +15,11 @@ #include "paddle/fluid/imperative/prepared_operator.h" #include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/imperative/infer_shape_context.h" +DECLARE_bool(check_nan_inf); + namespace paddle { namespace imperative { @@ -88,7 +91,8 @@ PreparedOp PrepareImpl(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, - const framework::AttributeMap& attrs) { + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -105,9 +109,9 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif // 1. get expected kernel key - auto expected_kernel_key = - op.GetExpectedKernelType(DygraphExecutionContext( - op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs)); + auto expected_kernel_key = op.GetExpectedKernelType( + DygraphExecutionContext(op, framework::Scope(), *dev_ctx, ctx, + ins, outs, attrs, default_attrs)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; // 2. check if op[type] has kernel registered. 
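The default_attrs map threaded through the dygraph hunks above is consulted as a fallback whenever an attribute is missing from the op's own map, and HasAttr() now reports true if either map contains the name. A minimal sketch of that two-level lookup with plain std::map stand-ins for framework::AttributeMap (the int value type is a simplification):

#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

using AttributeMap = std::map<std::string, int>;  // value type simplified

// Look up `name` in the op's own attributes first, then in the defaults
// registered for the op type; fail only when both maps miss.
const int& GetAttr(const AttributeMap& attrs, const AttributeMap& default_attrs,
                   const std::string& name) {
  auto it = attrs.find(name);
  if (it == attrs.end()) {
    it = default_attrs.find(name);
    if (it == default_attrs.end()) {
      throw std::out_of_range("can not find attribute [" + name + "]");
    }
  }
  return it->second;
}

bool HasAttr(const AttributeMap& attrs, const AttributeMap& default_attrs,
             const std::string& name) {
  return attrs.count(name) != 0 || default_attrs.count(name) != 0;
}

int main() {
  AttributeMap attrs = {{"axis", 1}};
  AttributeMap defaults = {{"axis", 0}, {"use_mkldnn", 0}};
  std::cout << GetAttr(attrs, defaults, "axis") << " "          // 1, own attr wins
            << GetAttr(attrs, defaults, "use_mkldnn") << " "    // 0, from defaults
            << HasAttr(attrs, defaults, "use_mkldnn") << "\n";  // 1
  return 0;
}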
@@ -124,6 +128,19 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #ifdef PADDLE_WITH_XPU if (kernel_iter == kernels.end() && is_xpu_place(expected_kernel_key.place_)) { + VLOG(3) << "missing XPU kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; + expected_kernel_key.place_ = platform::CPUPlace(); + kernel_iter = kernels.find(expected_kernel_key); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + if (kernel_iter == kernels.end() && + is_npu_place(expected_kernel_key.place_)) { + VLOG(3) << "missing NPU kernel: " << op.Type() + << ", expected_kernel_key:" << expected_kernel_key + << ", fallbacking to CPU one!"; expected_kernel_key.place_ = platform::CPUPlace(); kernel_iter = kernels.find(expected_kernel_key); } @@ -145,16 +162,19 @@ PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, - const framework::AttributeMap& attrs) { - return PrepareImpl(ins, outs, op, place, attrs); + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs) { + return PrepareImpl(ins, outs, op, place, attrs, default_attrs); } PreparedOp PreparedOp::Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, - const framework::AttributeMap& attrs) { - return PrepareImpl(ins, outs, op, place, attrs); + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs) { + return PrepareImpl(ins, outs, op, place, attrs, + default_attrs); } template @@ -163,17 +183,23 @@ static void PreparedOpRunImpl( const framework::OpKernelType& kernel_type, const framework::OperatorWithKernel::OpKernelFunc& func, platform::DeviceContext* dev_ctx, const NameVarMap& ins, - const NameVarMap& outs, const framework::AttributeMap& attrs) { + const NameVarMap& outs, const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs) { // TODO(zjl): remove scope in dygraph framework::Scope scope; DygraphInferShapeContext infer_shape_ctx(&ins, &outs, &attrs, - op.Type()); + &default_attrs, op.Type()); static_cast(op).InferShape( &infer_shape_ctx); func(DygraphExecutionContext(op, scope, *dev_ctx, ctx, ins, outs, - attrs)); + attrs, default_attrs)); + + if (FLAGS_check_nan_inf) { + framework::details::CheckOpHasNanOrInfInDygraph( + op.Type(), outs, dev_ctx->GetPlace()); + } /** * [ Why need handle complex gradient to real gradient? 
] @@ -194,16 +220,18 @@ static void PreparedOpRunImpl( void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, - const framework::AttributeMap& attrs) { + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs) { PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, ins, - outs, attrs); + outs, attrs, default_attrs); } void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, - const framework::AttributeMap& attrs) { + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs) { PreparedOpRunImpl(op_, ctx_, kernel_type_, func_, dev_ctx_, - ins, outs, attrs); + ins, outs, attrs, default_attrs); } } // namespace imperative diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 1f6be5483be30baf59f5b007f623d997bb041b9c..53f876c498cd04bdacaf18ded5a20f2dac428223 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -151,20 +151,24 @@ class PreparedOp { const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, - const framework::AttributeMap& attrs); + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs); static PreparedOp Prepare(const NameVarMap& ins, const NameVarMap& outs, const framework::OperatorWithKernel& op, const platform::Place& place, - const framework::AttributeMap& attrs); + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs); void Run(const NameVarMap& in, const NameVarMap& out, - const framework::AttributeMap& attrs); + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs); void Run(const NameVarMap& ins, const NameVarMap& outs, - const framework::AttributeMap& attrs); + const framework::AttributeMap& attrs, + const framework::AttributeMap& default_attrs); const framework::OpKernelType& kernel_type() const { return kernel_type_; } diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h index bd132f2576fec14511523958d4ce64077b99b1f1..1baf73ab3b95da869922e5d4745c91356025799e 100644 --- a/paddle/fluid/imperative/py_layer_fwd.h +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -17,6 +17,7 @@ #include #include #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/prepared_operator.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/framework/op_registry.h" @@ -32,7 +33,17 @@ bool RequiredGrad(const NameVarBaseMap& ins, const NameVarBaseMap& outs) { for (const auto& name_pair : ins) { for (const auto& var_base : name_pair.second) { if (!var_base->OverridedStopGradient()) { - PassStopGradient(outs, var_base->OverridedStopGradient()); + for (const auto& pair : outs) { + for (const auto& var : pair.second) { + if (var) { + var->SetOverridedStopGradient(false); + SetForwardDataTypeOfGradVar(var); + VLOG(3) << "Set output: " << var->Name() + << "'s OverridedStopGradient as " + << var->OverridedStopGradient(); + } + } + } return true; } } @@ -63,42 +74,51 @@ std::shared_ptr CreateGradOpNode( } } -py::object PyLayerApply(const platform::Place& place, const py::object& cls, +py::object PyLayerApply(const platform::Place& place, const py::handle& cls, const py::args args, const py::kwargs kwargs) { + py::gil_scoped_acquire guard; auto bk_function = cls.attr("_backward_function"); auto context = bk_function(); auto forward = cls.attr("forward"); auto result_forward = forward(context, *args, 
**kwargs); std::shared_ptr py_layer_ctx = - std::make_shared(context.release().ptr()); + std::make_shared(context.ptr()); // make inputs to varbase std::vector> input_vars; // process args,`input_vars` only collect `imperative::VarBase` if (!args.empty()) { for (auto ptr = args.begin(); ptr != args.end(); ptr++) { - try { - if (Py_None != ptr->ptr()) { + // Only collect Tensor type in 'args' and pass them to backward. Ignore + // other types of input temporarily. + if (py::isinstance(*ptr)) { + try { auto a = ptr->cast>(); input_vars.push_back(a); + } catch (py::cast_error& err) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->ptr()->ob_type->tp_name)); } - } catch (py::cast_error& err) { - // Only collect Tensor type in 'args' and pass them to backward. Ignore - // other types of input temporarily. } } } // process kwargs, only collect `imperative::VarBase` if (!kwargs.empty()) { for (auto ptr = kwargs.begin(); ptr != kwargs.end(); ptr++) { - try { - if (Py_None != ptr->second.ptr()) { + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. + if (py::isinstance(*ptr->second)) { + try { auto a = ptr->second.cast>(); input_vars.push_back(a); + } catch (py::cast_error&) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function contains invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + ptr->second.ptr()->ob_type->tp_name)); } - } catch (py::cast_error&) { - // Only collect Tensor type in 'kwargs' and pass them to backward. - // Ignore other types of input temporarily. } } } @@ -109,35 +129,41 @@ py::object PyLayerApply(const platform::Place& place, const py::object& cls, PyList_Check(result_forward.ptr())) { auto tuple_result = result_forward.cast(); for (size_t i = 0; i < tuple_result.size(); i++) { - if (Py_None != tuple_result[i].ptr()) { + // Only collect Tensor type of output and pass them to backward. + // Ignore other types of input temporarily. + if (py::isinstance(tuple_result[i])) { try { auto temp_out = tuple_result[i].cast>(); output_vars.push_back(temp_out); } catch (py::cast_error&) { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.forward` should be `Tensor`.")); + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function returns invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + tuple_result[i].ptr()->ob_type->tp_name)); } - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.forward` can not be `None`.")); } } } else { - if (Py_None != result_forward.ptr()) { + // Only collect Tensor type of output and pass them to backward. + // Ignore other types of input temporarily. 
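The PyLayerApply changes above filter arguments with py::isinstance before casting, instead of silently swallowing py::cast_error, and raise a descriptive error only when something that looks like a Tensor fails to cast. A compact pybind11 sketch of that filter, with MyType standing in for the real bound VarBase class; built as a module, collect(MyType(), 1, "x") would return 1:

#include <pybind11/pybind11.h>
#include <memory>
#include <vector>

namespace py = pybind11;

struct MyType { int value = 0; };  // placeholder for the real bound class

// Collect only arguments that are instances of the bound MyType class,
// mirroring the isinstance-then-cast filtering added to PyLayerApply.
std::vector<std::shared_ptr<MyType>> CollectMyType(const py::args& args) {
  std::vector<std::shared_ptr<MyType>> out;
  for (auto it = args.begin(); it != args.end(); ++it) {
    if (!py::isinstance<MyType>(*it)) continue;  // silently skip other types
    try {
      out.push_back(it->cast<std::shared_ptr<MyType>>());
    } catch (py::cast_error&) {
      throw py::type_error("argument looked like MyType but could not be cast");
    }
  }
  return out;
}

PYBIND11_MODULE(filter_example, m) {
  py::class_<MyType, std::shared_ptr<MyType>>(m, "MyType").def(py::init<>());
  m.def("collect", [](py::args args) { return CollectMyType(args).size(); });
}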
+ if (py::isinstance(result_forward)) { try { auto temp_out = result_forward.cast>(); output_vars.push_back(temp_out); } catch (py::cast_error&) { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.forward` should be `Tensor`.")); + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.forward` function returns invalid argument, the `%s` " + "type argument can not be cast into `Tensor`.", + result_forward.ptr()->ob_type->tp_name)); } - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.forward` can not be `None`.")); } } + if (output_vars.size() == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "At least one output of `PyLayer.forward` is a `Tensor`.")); + } NameVarBaseMap outs = {{"Out", output_vars}}; diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index a92704ce447dc1cfe1f309e0b1da8f61dd6c5a4d..0f6676ed48f349c7aa8d66459f7c74355bf53a9b 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -297,7 +297,7 @@ Reducer::Reducer(const std::vector> &vars, is_sparse_gradient_(is_sparse_gradient), parallel_ctx_(parallel_ctx), group_size_limits_(group_size_limits), - find_unused_vars_(find_unused_vars) { + find_unused_vars_each_step_(find_unused_vars) { VLOG(3) << "Start construct the Reducer ..."; nrings_ = parallel_ctx->GetNRings(); nranks_ = parallel_ctx->GetNRanks(); @@ -443,10 +443,6 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { auto *cur_node = q.front(); q.pop(); - for (auto &cur_op : *cur_node) { - cur_op.EnforceHasInOut(); - } - const auto &grad_pending_nodes = cur_node->GradPendingNodes(); for (auto &grad_pending_node : grad_pending_nodes) { PADDLE_ENFORCE_NOT_NULL( @@ -461,42 +457,8 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { } } -// After each batch is calculated, the counter of each group(group.pending_) -// and allreudce sequence counter(next_group_) will be cleaned up again. -void Reducer::PrepareForBackward( +void Reducer::TraverseBackwardGraph( const std::vector> &outputs) { - VLOG(3) << "after forward, then reset count for backward."; - next_group_ = 0; - std::for_each(groups_.begin(), groups_.end(), [](Group &group) { - group.pending_ = group.variable_indices_.size(); - group.sparse_contents_ = nullptr; - }); - - // reinitialize vars_marked_ready_ for next iteration - vars_marked_ready_.clear(); - vars_marked_ready_.resize(vars_.size(), false); - - PADDLE_ENFORCE_EQ( - groups_need_finalize_, false, - platform::errors::PreconditionNotMet( - "A serious error has occurred here. There may be several reasons: " - "1) Please note that all forward outputs derived from the module " - "parameters must participate in the calculation of losses and " - "subsequent gradient calculations. If not, the wrapper will hang, " - "waiting for autograd to generate gradients for these parameters. " - "you can use detach or stop_gradient to make the unused parameters " - "detached from the autograd graph. " - "2) Used multiple forwards and one backward. 
You may be able to wrap " - "multiple forwards in a model.")); - - // The first var to trigger the unused parameter - has_marked_unused_vars_ = false; - unused_vars_.clear(); - - if (!find_unused_vars_) { - return; - } - node_deps_.clear(); std::queue> q; std::unordered_set var_visited; @@ -523,7 +485,6 @@ void Reducer::PrepareForBackward( q.pop(); for (const auto &cur_op : *cur_node) { - cur_op.EnforceHasInOut(); auto &bwd_outs = cur_op.GetOutsMap(); for (const auto &pair : bwd_outs) { if (!pair.second.IsGrad()) { @@ -559,8 +520,50 @@ void Reducer::PrepareForBackward( << "] is not used"; } } +} - if (unused_vars_.empty()) { +// After each batch is calculated, the counter of each group(group.pending_) +// and allreudce sequence counter(next_group_) will be cleaned up again. +void Reducer::PrepareForBackward( + const std::vector> &outputs) { + VLOG(3) << "after forward, then reset count for backward."; + next_group_ = 0; + std::for_each(groups_.begin(), groups_.end(), [](Group &group) { + group.pending_ = group.variable_indices_.size(); + group.sparse_contents_ = nullptr; + }); + + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(vars_.size(), false); + + PADDLE_ENFORCE_EQ( + groups_need_finalize_, false, + platform::errors::PreconditionNotMet( + "A serious error has occurred here. Please " + "set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have " + "set, There may be several reasons for this error: " + "1) Please note that all forward outputs derived from the module " + "parameters must participate in the calculation of losses and " + "subsequent gradient calculations. If not, the wrapper will hang, " + "waiting for autograd to generate gradients for these parameters. " + "you can use detach or stop_gradient to make the unused parameters " + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); + + // The first var to trigger the unused parameter + has_marked_unused_vars_ = false; + + if (find_unused_vars_once_ || find_unused_vars_each_step_) { + unused_vars_.clear(); + TraverseBackwardGraph(outputs); + // only check once in first step + find_unused_vars_once_ = false; + } + + if (find_unused_vars_each_step_ && unused_vars_.empty()) { LOG_FIRST_N(WARNING, 1) << "All parameters are involved in the backward pass. " "It is recommended to set find_unused_parameters to False " @@ -569,7 +572,9 @@ void Reducer::PrepareForBackward( "will occur. Please make it clear that in the subsequent " "training, there will be no parameters that are not used " "in the backward pass, and then set find_unused_parameters"; - } else if (unused_vars_.size() == vars_.size()) { + } + + if (unused_vars_.size() == vars_.size()) { LOG_FIRST_N(WARNING, 1) << "There is no parameter in the device involved " "in the backward calculation. 
If there are " @@ -600,13 +605,13 @@ void Reducer::AddDistHook(size_t var_index) { local_used_vars_[var_index] = 1; - // rebuild group when find_unused_vars_ is false + // rebuild group when find_unused_vars_each_step_ is false if (NeedRebuildGroup()) { rebuild_vars_.push_back(vars_[var_index]); rebuild_var_indices_.push_back(var_index); } - if (!has_marked_unused_vars_ && find_unused_vars_) { + if (!has_marked_unused_vars_) { has_marked_unused_vars_ = true; for (const auto &unused_index : unused_vars_) { MarkVarReady(unused_index, false); @@ -627,7 +632,9 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { if (vars_marked_ready_[var_index]) { auto error_info = string::Sprintf( "Error happened, when parameter[%d][%s] has been ready before. " - "There may be several reasons for this error: " + "Please set find_unused_parameters=True to traverse backward graph " + "in each step to prepare reduce in advance. If you have set, " + "there may be several reasons for this error: " "1) In multiple reentrant backward phase, some parameters are reused." "2) Using model parameters outside of forward function. Please " "make sure that model parameters are not shared in concurrent " @@ -695,10 +702,16 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { } } else { // process sparse group - PADDLE_ENFORCE_EQ(HasGrad(var_index), true, - platform::errors::PreconditionNotMet( - "The sparse parameter[%d][%s] must have a gradient", - var_index, vars_[var_index]->Name())); + PADDLE_ENFORCE_EQ( + HasGrad(var_index), true, + platform::errors::PreconditionNotMet( + "The sparse parameter[%d][%s] should have a gradient. " + "Currently, DataParallel does not support sparse " + "parameters without generating gradients during training. " + "For example, if is_sparse=True is used in Embedding, " + "the current step of this parameter cannot generate a gradient " + "because of stop_gradient/detach, so an error will occur.", + var_index, vars_[var_index]->Name())); auto var_base = vars_[var_index]->GradVarBase(); // need to check tensor type PADDLE_ENFORCE_EQ( @@ -762,10 +775,11 @@ void Reducer::MarkGroupReady(size_t group_index) { // TODO(liuyuhui): Add try catch to deal with exception later, // otherwise the main thread will continue to run when an exception is // thrown in comm_pool_.
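The reducer changes above split the old `find_unused_vars_` flag into `find_unused_vars_each_step_` (driven by the user-facing `find_unused_parameters` option) and `find_unused_vars_once_`, so the backward graph is still traversed once on the first step even when the per-step search is disabled. A stripped-down sketch of that gating, with placeholder members rather than the real `Reducer`:

```cpp
// Sketch only: the "traverse once, optionally every step" gating used above.
#include <iostream>
#include <vector>

struct MiniReducer {
  bool find_unused_vars_each_step_{false};  // user switch (find_unused_parameters)
  bool find_unused_vars_once_{true};        // forced single traversal
  std::vector<int> unused_vars_;

  void TraverseBackwardGraph() {
    // Stand-in for the real graph walk that fills unused_vars_.
    unused_vars_ = {/* indices of parameters that produce no gradient */};
  }

  void PrepareForBackward() {
    if (find_unused_vars_once_ || find_unused_vars_each_step_) {
      unused_vars_.clear();
      TraverseBackwardGraph();
      find_unused_vars_once_ = false;  // only forced in the first step
    }
    if (find_unused_vars_each_step_ && unused_vars_.empty()) {
      std::cout << "all parameters are used; find_unused_parameters could be False\n";
    }
  }
};

int main() {
  MiniReducer r;
  r.PrepareForBackward();  // step 1: traverses even though the switch is off
  r.PrepareForBackward();  // step 2: skips the traversal
}
```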
- comm_pool_->enqueue([&] { + auto next_group = next_group_; + comm_pool_->enqueue([this, run_order, next_group, &group] { auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; platform::SetXPUDeviceId(dev_id); - FusedAllReduceSchedule(run_order, group, next_group_); + FusedAllReduceSchedule(run_order, group, next_group); { std::lock_guard lock(mutex_); comm_op_count_ -= 1; // lock @@ -947,7 +961,7 @@ void Reducer::FinalizeBackward() { InitializeGroups(group_indices_); } - if (find_unused_vars_) { + if (find_unused_vars_each_step_) { // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ProcessUnusedDenseVars(); diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index 0d613dbea896339760d320a6b9937ffcc8ea0dcc..8392ab2c704d503a622cc09cd5a7efb8ebc680b3 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -162,13 +162,16 @@ class Reducer { std::vector> RebuildGruops(); inline bool NeedRebuildGroup() { - return !has_rebuilt_group_ && !find_unused_vars_; + return !has_rebuilt_group_ && !find_unused_vars_each_step_; } void ProcessUnusedDenseVars(); bool HasGrad(size_t var_index); + void TraverseBackwardGraph( + const std::vector>& outputs); + private: std::vector> vars_; std::vector> group_indices_; @@ -195,7 +198,8 @@ class Reducer { std::unordered_map var_index_map_; std::vector unused_vars_; bool has_marked_unused_vars_{false}; - bool find_unused_vars_{false}; + bool find_unused_vars_each_step_{false}; + bool find_unused_vars_once_{true}; bool groups_need_finalize_{false}; #ifdef PADDLE_WITH_XPU_BKCL // comm_pool_ is used for scheduling allreduce in multi Kunlun cards training. diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc index 4a30ffb7e3d01ffa90a42278e2e5ef5271045d8a..064f47f54979a135fb83f9636ebc6f5105e7c39d 100644 --- a/paddle/fluid/imperative/tests/test_layer.cc +++ b/paddle/fluid/imperative/tests/test_layer.cc @@ -43,10 +43,12 @@ template class TestRuntimeInferVarTypeContext : public RuntimeInferVarTypeContext { public: - TestRuntimeInferVarTypeContext(const NameVarMap& inputs, - const NameVarMap& outputs, - const framework::AttributeMap& attrs_map) - : RuntimeInferVarTypeContext(inputs, outputs, attrs_map) {} + TestRuntimeInferVarTypeContext( + const NameVarMap& inputs, const NameVarMap& outputs, + const framework::AttributeMap& attrs_map, + const framework::AttributeMap& default_attrs_map) + : RuntimeInferVarTypeContext(inputs, outputs, attrs_map, + default_attrs_map) {} bool HasVar(const std::string& name) const { return RuntimeInferVarTypeContext::HasVar(name); @@ -125,7 +127,7 @@ TEST(test_layer, test_runtime_context) { auto* ctx = new imperative::TestRuntimeInferVarTypeContext( - ins, outs, attrs); + ins, outs, attrs, {}); ASSERT_TRUE(ctx->HasInput("X")); ASSERT_TRUE(ctx->HasOutput("Out")); @@ -358,7 +360,7 @@ TEST(test_layer, test_dygraph_execution_context) { framework::Scope scope; DygraphExecutionContext dy_exe_context( - *(op.get()), scope, *dev_ctx, ctx, ins, outs, concat_att_map); + *(op.get()), scope, *dev_ctx, ctx, ins, outs, concat_att_map, {}); ASSERT_EQ(dy_exe_context.InputSize("X"), 1u); ASSERT_EQ(dy_exe_context.InputName("X"), "vin"); @@ -386,7 +388,7 @@ TEST(test_layer, test_dygraph_infershape_context) { concat_att_map["axis"] = 1; DygraphInferShapeContext infer_shape_ctx( - &ins, &outs, &concat_att_map, "dummy"); + &ins, &outs, &concat_att_map, {}, 
"dummy"); bool have_x = infer_shape_ctx.HasOutputs("Out"); ASSERT_EQ(have_x, true); diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index 7d6882a4ee7d005d3baec168e9e4ff32d95d619c..5e269d74044d24adc7baea8875ecd9eb2d6772c1 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -93,7 +93,7 @@ TEST(test_prepare_op, test_prepare_op) { ASSERT_NO_FATAL_FAILURE(PreparedOp preparedOp = PreparedOp::Prepare( ins, outs, dynamic_cast(*op), - place, split_attr_map)); + place, split_attr_map, {})); } const framework::Tensor* GetTensorFromVar(const framework::Variable& var); @@ -144,7 +144,7 @@ TEST(test_prepare_op, test_prepare_data) { // test if it can be transformed to GPU place auto prepared_op = PreparedOp::Prepare( ins, outs, dynamic_cast(*op), gpu_place, - attr_map); + attr_map, {}); PrepareData( dynamic_cast(*op), ins, prepared_op.kernel_type()); @@ -193,7 +193,7 @@ void TestPrepareDataSamePlace(framework::AttributeMap attr_map) { // test if it never transferred on GPU place auto prepared_op = PreparedOp::Prepare( ins, outs, dynamic_cast(*op), cpu_place, - attr_map); + attr_map, {}); PrepareData( dynamic_cast(*op), ins, prepared_op.kernel_type()); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 742514c0910a23c99ab5286c23071bfcf2db0385..3d97d68b5c7dfd66e80620b3cbc2d6dc6f00d5b0 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -84,7 +84,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( if (gcs_.count(place) == 0) { std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc.reset(new framework::DefaultStreamGarbageCollector( BOOST_GET_CONST(platform::CUDAPlace, place), 0)); @@ -95,7 +95,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc.reset(new framework::CUDAPinnedGarbageCollector( BOOST_GET_CONST(platform::CUDAPinnedPlace, place), 0)); @@ -120,6 +120,17 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( gc.reset(new framework::CPUGarbageCollector( BOOST_GET_CONST(platform::CPUPlace, place), 0)); VLOG(10) << "Created GarbageCollector at " << place; + } else if (platform::is_npu_place(place)) { +#if defined(PADDLE_WITH_ASCEND_CL) + // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. 
+ gc.reset(new framework::NPUUnsafeFastGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place), 0)); + VLOG(10) << "Created GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use NPU device since it's not compiled with NPU," + "Please recompile or reinstall Paddle with NPU support.")); +#endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( "Unsupported place for garbage collection")); @@ -154,9 +165,14 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, const auto& op_info = op->Info(); auto* attr_checker = op_info.Checker(); if (attr_checker) { - attr_checker->Check(&attrs, true); + attr_checker->Check(&attrs, true, /*only_check_exist_value=*/true); } + static paddle::framework::AttributeMap empty_attrs_map = {}; + const paddle::framework::AttributeMap& default_attrs = + attr_checker == nullptr ? empty_attrs_map + : attr_checker->GetDefaultAttrMap(); + NameVarBaseMap new_ins = ins; if (enable_autocast_) { VLOG(5) << "Auto mixed precision run operator: " << type; @@ -178,10 +194,18 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with XPU if use XPUPlace.")); +#endif + } else if (platform::is_npu_place(place)) { +#ifdef PADDLE_WITH_ASCEND_CL + platform::SetNPUDeviceId( + BOOST_GET_CONST(platform::NPUPlace, place).device); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU if use NPUPlace.")); #endif } - OpBase::Run(*op, new_ins, outs, attrs, place); + OpBase::Run(*op, new_ins, outs, attrs, default_attrs, place); } catch (platform::EnforceNotMet& exception) { framework::AppendErrorOpHint(type, &exception); throw std::move(exception); @@ -204,7 +228,8 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, } if (ComputeRequiredGrad(new_ins, outs, trace_backward)) { - CreateGradOpNode(*op, new_ins, outs, attrs, place, inplace_map); + CreateGradOpNode(*op, new_ins, outs, attrs, default_attrs, place, + inplace_map); } else { VLOG(3) << "No Grad to track for Op: " << type; } diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index cace420d87c9df54387c27cecc58705c19ce5336..ebea4d0386090cc983d2edcc5a29ff5089b86ab4 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -182,15 +182,16 @@ static bool PathExists(const std::string &path) { } static std::string GetDirRoot(const std::string &path) { - char sep = '/'; - -#ifdef _WIN32 - sep = '\\'; -#endif - - size_t i = path.rfind(sep, path.length()); - if (i != std::string::npos) { - return (path.substr(0, i)); + char sep_1 = '/', sep_2 = '\\'; + + size_t i_1 = path.rfind(sep_1, path.length()); + size_t i_2 = path.rfind(sep_2, path.length()); + if (i_1 != std::string::npos && i_2 != std::string::npos) { + return path.substr(0, std::max(i_1, i_2)); + } else if (i_1 != std::string::npos) { + return path.substr(0, i_1); + } else if (i_2 != std::string::npos) { + return path.substr(0, i_2); } return path; } diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 8407f98e6dfd9bb253558242fea052846d71eb7e..4bb08dc96b1cf529c1b433092f3b9e51d03aa7e9 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -106,8 +106,8 @@ void 
IRPassManager::CreatePasses(Argument *argument, bool use_static_engine = argument->tensorrt_use_static_engine(); bool model_from_memory = argument->model_from_memory(); std::string optim_cache_dir = argument->optim_cache_dir(); - bool int8_valid = - !(model_from_memory && optim_cache_dir.empty() && enable_int8); + bool int8_valid = !(model_from_memory && optim_cache_dir.empty() && + enable_int8 && use_calib_mode); PADDLE_ENFORCE_EQ( int8_valid, true, platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index fdfd2c60af0c16404953e8639385e539dc13c9b3..715316387289ccbba788aa000e175856010c4451 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -123,12 +123,27 @@ void MemoryOptimizePass::CollectVarMemorySize( } return true; }; + + // MemoryOptimizePass surppose input model is directed acyclic graph + // although it's not always the case. so black list is the best compromise + // between performance and underlying principle. + std::unordered_set black_list; + for (auto* node : graph_->Nodes()) { + if (node->IsVar() && + node->Var()->GetType() == + framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { + if (!valid_var(node)) { + black_list.emplace(node->Var()->Name()); + } + } + } + // Collect tensors from graph. for (auto* node : graph_->Nodes()) { if (node->IsVar() && node->Var()->GetType() == framework::proto::VarType::Type::VarType_Type_LOD_TENSOR && - valid_var(node)) { + !black_list.count(node->Var()->Name())) { // Parameters will not be reused. if (node->Var()->Persistable()) continue; auto shape = node->Var()->GetShape(); diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 82c95ba2c95712d2ebe3aa80286689028febf3fe..c7d947c58039efa80d5b8336bc5db99cd89cee82 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -71,7 +71,7 @@ elseif (WIN32) cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() -if(WITH_TESTING) +if(WITH_TESTING AND TEST test_api_impl) if(NOT APPLE) set_tests_properties(test_api_impl PROPERTIES TIMEOUT 120) endif() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 853c1ac1da8742733e609c1dea098a208eadc015..b5ca0ef5924397544882741078043d747a145ebf 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -421,7 +421,6 @@ void AnalysisConfig::Update() { pass_builder()->AppendPass(pass); } } - LOG(INFO) << "use_dlnne_:" << use_dlnne_ << std::endl; if (use_dlnne_) { pass_builder()->ClearPasses(); for (const auto &pass : kDlnneSubgraphPasses) { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6a6be14fd5977dcb7a7909b17a7684780391042c..1aa46ab571338f853d1e450bf404330c61e9f10b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -152,8 +152,8 @@ bool AnalysisPredictor::Init( : platform::ProfilerState::kCPU; platform::EnableProfiler(tracking_device); } else { - LOG(INFO) << "Profiler is deactivated, and no profiling report will be " - "generated."; + VLOG(2) << "Profiler is deactivated, and no 
profiling report will be " + "generated."; } // no matter with or without MKLDNN @@ -191,22 +191,8 @@ bool AnalysisPredictor::PrepareScope( status_is_cloned_ = true; } else { paddle::framework::InitDevices(); - scope_.reset(new paddle::framework::Scope(), [](framework::Scope *scope) { - delete scope; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < paddle::platform::GetCUDADeviceCount(); - ++dev_id) { - memory::Release(platform::CUDAPlace(dev_id)); - } -#endif -#ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < paddle::platform::GetXPUDeviceCount(); - ++dev_id) { - memory::Release(platform::XPUPlace(dev_id)); - } -#endif - memory::Release(platform::CPUPlace()); - }); + // TODO(wilber): we need to release memory occupied by weights. + scope_.reset(new paddle::framework::Scope()); status_is_cloned_ = false; } sub_scope_ = &scope_->NewScope(); @@ -284,7 +270,48 @@ bool AnalysisPredictor::CreateExecutor() { executor_.reset(new paddle::framework::NaiveExecutor(place_)); return true; } + +static bool IsPrepareDataOptTargetOp(framework::OpDesc *op) { + // here is prepare data optimization related bad cases: + // let's assume an op behind conditional_block and if conditional_block + // chooses branch 1, the op need to call prepare data. else the op don't need + // to call prepare data. In running, if predictor chooses branch 2, then + // optimization takes effect, later issue is followed if predictor chooses + // branch 1, because the op lost chance to prepare data. + std::vector op_type = {"conditional_block_infer", + "select_input"}; + for (const auto &type : op_type) { + if (op->Type() == type) { + return true; + } + } + return false; +} + +static void DisablePrepareDataOpt( + std::shared_ptr inference_program, int block, + bool pre_disable_opt) { + bool disable_opt = false; + auto &infer_block = inference_program->Block(block); + for (auto *op : infer_block.AllOps()) { + if (disable_opt || pre_disable_opt) { + op->SetAttr("inference_force_prepare_data", true); + } + if (op->HasAttr("sub_block")) { + int blockID = op->GetBlockAttrId("sub_block"); + DisablePrepareDataOpt(inference_program, blockID, + disable_opt || pre_disable_opt); + } + // disable prepare data if unfriendly op is found + if (!disable_opt) { + disable_opt = IsPrepareDataOptTargetOp(op); + } + } +} + bool AnalysisPredictor::PrepareExecutor() { + DisablePrepareDataOpt(inference_program_, 0, false); + executor_->Prepare(sub_scope_, *inference_program_, 0, config_.use_feed_fetch_ops_); @@ -316,8 +343,6 @@ void AnalysisPredictor::MkldnnPreSet( platform::MKLDNNDeviceContext::tls().set_cur_mkldnn_session_id( platform::MKLDNNDeviceContextThreadLocals:: kMKLDNNSessionID_CacheClearing); - platform::MKLDNNDeviceContext::tls().set_cur_input_shape_cache_capacity( - config_.mkldnn_cache_capacity_); // Set current_input_shape for caching dynamic shape. 
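`DisablePrepareDataOpt` above walks block 0 and every `sub_block` recursively, and once an unfriendly control-flow op (`conditional_block_infer` or `select_input`) is seen, all later ops, including those in nested blocks, get `inference_force_prepare_data` set. A small self-contained sketch of that traversal over a toy block structure (not the real `framework::OpDesc`/`BlockDesc` types):

```cpp
// Sketch only: recursive block walk that force-disables an optimization for
// every op after the first "unfriendly" op, mirroring DisablePrepareDataOpt.
#include <iostream>
#include <string>
#include <vector>

struct ToyOp {
  std::string type;
  int sub_block = -1;          // index into blocks, -1 if none
  bool force_prepare = false;  // stands in for inference_force_prepare_data
};
using ToyBlock = std::vector<ToyOp>;

bool IsUnfriendly(const ToyOp& op) {
  return op.type == "conditional_block_infer" || op.type == "select_input";
}

void DisableOpt(std::vector<ToyBlock>& blocks, int block, bool pre_disable) {
  bool disable = false;
  for (auto& op : blocks[block]) {
    if (disable || pre_disable) op.force_prepare = true;
    if (op.sub_block >= 0) DisableOpt(blocks, op.sub_block, disable || pre_disable);
    if (!disable) disable = IsUnfriendly(op);  // every op after this is tainted
  }
}

int main() {
  std::vector<ToyBlock> blocks = {
      {{"feed"}, {"conditional_block_infer", 1}, {"fetch"}},  // block 0
      {{"matmul"}}                                            // block 1 (sub block)
  };
  DisableOpt(blocks, 0, false);
  for (const auto& op : blocks[0])
    std::cout << op.type << " -> " << op.force_prepare << "\n";
}
```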
std::stringstream ss; for (size_t i = 0; i < inputs_shape.size(); ++i) { @@ -328,6 +353,9 @@ void AnalysisPredictor::MkldnnPreSet( VLOG(2) << "Set input shape=" << ss.str(); platform::MKLDNNDeviceContext::tls().set_cur_input_shape_str(ss.str()); } + platform::MKLDNNDeviceContext::tls().set_cur_input_shape_cache_capacity( + config_.mkldnn_cache_capacity_); + #endif } @@ -343,10 +371,9 @@ void AnalysisPredictor::MkldnnPostReset() { CHECK_LE(shape_blob_size, static_cast(config_.mkldnn_cache_capacity_)); } - paddle::platform::MKLDNNDeviceContext::tls().set_cur_mkldnn_session_id( - platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default); - platform::MKLDNNDeviceContext::tls().set_cur_input_shape_cache_capacity(0); - platform::MKLDNNDeviceContext::tls().set_cur_input_shape_str(""); + // We cannot reset to the default cache settings + // as there maybe CopyToCPU method used and oneDNN + // primitives are used there so cache would grow } #endif } @@ -664,13 +691,6 @@ std::unique_ptr CreatePaddlePredictor< gflags.push_back("--cudnn_deterministic=True"); } - if (config.thread_local_stream_enabled()) { - gflags.push_back("--allocator_strategy=thread_local"); - process_level_allocator_enabled = false; - } else { - process_level_allocator_enabled = true; - } - // TODO(wilber): jetson tx2 may fail to run the model due to insufficient memory // under the native_best_fit strategy. Modify the default allocation strategy to // auto_growth. todo, find a more appropriate way to solve the problem. @@ -678,6 +698,15 @@ std::unique_ptr CreatePaddlePredictor< gflags.push_back("--allocator_strategy=auto_growth"); #endif + // TODO(Shixiaowei02): Add a mandatory scheme to use the thread local + // allocator when multi-stream is enabled. + if (config.thread_local_stream_enabled()) { + gflags.push_back("--allocator_strategy=thread_local"); + process_level_allocator_enabled = false; + } else { + process_level_allocator_enabled = true; + } + if (framework::InitGflags(gflags)) { VLOG(3) << "The following gpu analysis configurations only take effect " "for the first predictor: "; @@ -1209,6 +1238,9 @@ USE_TRT_CONVERTER(roi_align); USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(nearest_interp); +USE_TRT_CONVERTER(reshape); +USE_TRT_CONVERTER(reduce_sum); +USE_TRT_CONVERTER(gather_nd); #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 0a09b062803f6ea15d8b3fa361d60e91e9ccc4b9..47abe3298aa7c4c8d5857ad8184b65dfef39b417 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -72,8 +72,12 @@ if(WITH_GPU) if(NOT WIN32) set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") else() - if(CUDA_LIB STREQUAL "") - set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") + if(NOT DEFINED CUDA_LIB) + if(DEFINED ENV{CUDA_PATH}) + set(CUDA_LIB "$ENV{CUDA_PATH}\\lib\\x64") + else() + set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\lib\\x64") + endif() endif() endif(NOT WIN32) endif() diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 53f925966662667571ef39a5d51dc4536479c295..bf5de2d748a36b80b63f4b1795fa4bbc4d7f6776 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -85,7 +85,7 @@ for WITH_STATIC_LIB in ON OFF; do if [ $(echo `uname` | grep 
"Win") != "" ]; then # TODO(wilber, T8T9): Do we still need to support windows gpu static library if [ $TEST_GPU_CPU == ON ] && [ $WITH_STATIC_LIB == ON ]; then - return 0 + continue fi # -----simple_on_word2vec on windows----- cmake .. -G "Visual Studio 15 2017" -A x64 -T host=x64 -DPADDLE_LIB=${inference_install_dir} \ diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index f7dbfd39cd26e6af40d7536d76fd031bee5a331c..313cbfb7c786e967611c6d99ebbf1e843973e9a0 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -103,6 +104,8 @@ DataType Tensor::type() const { return DataType::INT32; } else if (type == paddle::framework::proto::VarType::UINT8) { return DataType::UINT8; + } else if (type == paddle::framework::proto::VarType::INT8) { + return DataType::INT8; } return DataType::FLOAT32; } @@ -161,8 +164,24 @@ void Tensor::CopyToCpu(T *data) { auto *t_data = tensor->data(); auto t_place = tensor->place(); + paddle::framework::Tensor out; + auto mem_allocation = std::make_shared( + static_cast(data), ele_num * sizeof(T), + paddle::platform::CPUPlace()); + out.ResetHolder(mem_allocation); + if (paddle::platform::is_cpu_place(t_place)) { +#ifdef PADDLE_WITH_MKLDNN + if (tensor->layout() == paddle::framework::DataLayout::kMKLDNN) + paddle::framework::innerTransDataLayoutFromMKLDNN( + tensor->layout(), paddle::platform::MKLDNNDeviceContext::tls() + .get_cur_paddle_data_layout(), + *tensor, &out, paddle::platform::CPUPlace(), true); + else + std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); +#else std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); +#endif } else if (place_ == PlaceType::kGPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::DeviceContextPool &pool = diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 793fc53d90b768050572a3dd0a080a5d30e959a2..f6cdbb00b50453d4c4ff7fc06ba82aa042dd194a 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -411,7 +411,8 @@ void AnalysisPredictor::MkldnnQuantizer::ClearDeviceContext() const { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(predictor_.place_); - dev_ctx->ResetBlobMap(); + dev_ctx->ResetBlobMap( + paddle::platform::MKLDNNDeviceContext::tls().get_curr_exec()); } void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 2bbd4bb837a22f672e5aa625f299424b6f0c5b88..81e742e8a6f6853459740d4d9c4be7dfef8dfaa3 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -31,6 +31,7 @@ #include #include #include + #include "paddle_infer_declare.h" // NOLINT /*! \file */ @@ -177,6 +178,26 @@ struct PD_INFER_DECL AnalysisConfig { /// void DisableGpu(); + /// + /// \brief Turn on XPU. 
+ /// + /// \param l3_workspace_size The size of the video memory allocated by the l3 + /// cache, the maximum is 16M. + /// \param locked Whether the allocated L3 cache can be locked. If false, + /// it means that the L3 cache is not locked, and the allocated L3 + /// cache can be shared by multiple models, and multiple models + /// sharing the L3 cache will be executed sequentially on the card. + /// \param autotune Whether to autotune the conv operator in the model. If + /// true, when the conv operator of a certain dimension is executed + /// for the first time, it will automatically search for a better + /// algorithm to improve the performance of subsequent conv operators + /// of the same dimension. + /// \param autotune_file Specify the path of the autotune file. If + /// autotune_file is specified, the algorithm specified in the + /// file will be used and autotune will not be performed again. + /// \param precision Calculation accuracy of multi_encoder + /// \param adaptive_seqlen Is the input of multi_encoder variable length + /// void EnableXpu(int l3_workspace_size = 0xfffc00, bool locked = false, bool autotune = true, const std::string& autotune_file = "", const std::string& precision = "int16", @@ -294,7 +315,7 @@ struct PD_INFER_DECL AnalysisConfig { /// workspace. /// \param max_batch_size The maximum batch size of this prediction task, /// better set as small as possible for less performance loss. - /// \param min_subgrpah_size The minimum TensorRT subgraph size needed, if a + /// \param min_subgraph_size The minimum TensorRT subgraph size needed, if a /// subgraph is smaller than this, it will not be transferred to TensorRT /// engine. /// \param precision The precision used in TensorRT. @@ -678,7 +699,7 @@ struct PD_INFER_DECL AnalysisConfig { bool xpu_adaptive_seqlen_; // mkldnn related. 
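The expanded `EnableXpu` documentation above describes five knobs beyond the L3 workspace size. A hedged usage sketch against the C++ `AnalysisConfig` declared in this header; the model paths and the autotune cache file are placeholders:

```cpp
// Sketch only: configuring XPU with the parameters documented above.
// Model paths and tuning values are placeholders, not shipped defaults.
#include "paddle/fluid/inference/api/paddle_analysis_config.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./model/__model__", "./model/params");
  // ~16MB L3 workspace, not locked, autotune conv and cache the result.
  config.EnableXpu(/*l3_workspace_size=*/0xfffc00,
                   /*locked=*/false,
                   /*autotune=*/true,
                   /*autotune_file=*/"./xpu_autotune.cache",
                   /*precision=*/"int16",
                   /*adaptive_seqlen=*/false);
  return config.use_xpu() ? 0 : 1;
}
```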
- int mkldnn_cache_capacity_{0}; + int mkldnn_cache_capacity_{10}; bool use_mkldnn_quantizer_{false}; std::shared_ptr mkldnn_quantizer_config_; bool use_mkldnn_bfloat16_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 2b7333edae0dae1f0313bf71fc824c922e20b84d..b2e3de63691c555b24eb6f1e1fb9ffcc35d400f9 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -112,6 +112,7 @@ const std::vector kTRTSubgraphPasses({ const std::vector kDlnneSubgraphPasses({ "is_test_pass", // + "delete_dropout_op_pass" // "simplify_with_basic_ops_pass", // "conv_bn_fuse_pass", // "depthwise_conv_bn_fuse_pass", // diff --git a/paddle/fluid/inference/capi_exp/pd_common.h b/paddle/fluid/inference/capi_exp/pd_common.h index 4b70ed7fbad297efdf1863317e3af2b69bed702b..e7f7ac88687e7c64cb554c24eb6c6b496d63326b 100644 --- a/paddle/fluid/inference/capi_exp/pd_common.h +++ b/paddle/fluid/inference/capi_exp/pd_common.h @@ -71,5 +71,5 @@ PD_ENUM(PD_PlaceType){PD_PLACE_UNK = -1, PD_PLACE_CPU, PD_PLACE_GPU, PD_ENUM(PD_DataType){ PD_DATA_UNK = -1, PD_DATA_FLOAT32, PD_DATA_INT32, - PD_DATA_INT64, PD_DATA_UINT8, + PD_DATA_INT64, PD_DATA_UINT8, PD_DATA_INT8, }; diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc index c45454e86bdaac5e8f054da91410eab7e2b873a2..e9104ef52376cd8f36358dba005c636f9f435a3d 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.cc +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/inference/capi_exp/pd_config.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" #include "paddle/fluid/platform/enforce.h" #define CHECK_NULL_POINTER_PARM(param) \ @@ -125,10 +127,14 @@ PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) { } void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, - int32_t l3_workspace_size) { + int32_t l3_workspace_size, PD_Bool locked, + PD_Bool autotune, const char* autotune_file, + const char* precision, PD_Bool adaptive_seqlen) { CHECK_AND_CONVERT_PD_CONFIG; - config->EnableXpu(l3_workspace_size); + config->EnableXpu(l3_workspace_size, locked, autotune, autotune_file, + precision, adaptive_seqlen); } + PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) { CHECK_AND_CONVERT_PD_CONFIG; return config->use_xpu(); @@ -378,5 +384,24 @@ void PD_ConfigPartiallyRelease(__pd_keep PD_Config* pd_config) { CHECK_AND_CONVERT_PD_CONFIG; config->PartiallyRelease(); } +void PD_ConfigDeletePass(__pd_keep PD_Config* pd_config, const char* pass) { + CHECK_AND_CONVERT_PD_CONFIG; + config->pass_builder()->DeletePass(pass); +} +void PD_ConfigInsertPass(__pd_keep PD_Config* pd_config, size_t idx, + const char* pass) { + CHECK_AND_CONVERT_PD_CONFIG; + config->pass_builder()->InsertPass(idx, pass); +} +void PD_ConfigAppendPass(__pd_keep PD_Config* pd_config, const char* pass) { + CHECK_AND_CONVERT_PD_CONFIG; + config->pass_builder()->AppendPass(pass); +} +__pd_give PD_OneDimArrayCstr* PD_ConfigAllPasses( + __pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + std::vector passes = config->pass_builder()->AllPasses(); + return paddle_infer::CvtVecToOneDimArrayCstr(passes); +} } // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h index 
e44983e24484eae930afa6b84db397ac3aad8f08..a47ca5d27687f710aa1c0bb6db4bf830492175aa 100644 --- a/paddle/fluid/inference/capi_exp/pd_config.h +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -25,6 +25,7 @@ #pragma once #include "pd_common.h" // NOLINT +#include "pd_types.h" // NOLINT typedef struct PD_Config PD_Config; @@ -154,10 +155,27 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu( /// \brief Turn on XPU. /// /// \param[in] pd_onfig config -/// \param[in] l3_workspace_size l3 workspace size. +/// \param l3_workspace_size The size of the video memory allocated by the l3 +/// cache, the maximum is 16M. +/// \param locked Whether the allocated L3 cache can be locked. If false, +/// it means that the L3 cache is not locked, and the allocated L3 +/// cache can be shared by multiple models, and multiple models +/// sharing the L3 cache will be executed sequentially on the card. +/// \param autotune Whether to autotune the conv operator in the model. If +/// true, when the conv operator of a certain dimension is executed +/// for the first time, it will automatically search for a better +/// algorithm to improve the performance of subsequent conv operators +/// of the same dimension. +/// \param autotune_file Specify the path of the autotune file. If +/// autotune_file is specified, the algorithm specified in the +/// file will be used and autotune will not be performed again. +/// \param precision Calculation accuracy of multi_encoder +/// \param adaptive_seqlen Is the input of multi_encoder variable length /// PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( - __pd_keep PD_Config* pd_config, int32_t l3_workspace_size); + __pd_keep PD_Config* pd_config, int32_t l3_workspace_size, PD_Bool locked, + PD_Bool autotune, const char* autotune_file, const char* precision, + PD_Bool adaptive_seqlen); /// /// \brief A boolean state telling whether the XPU is turned on. /// @@ -565,6 +583,35 @@ PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIsValid( /// PADDLE_CAPI_EXPORT extern void PD_ConfigPartiallyRelease( __pd_keep PD_Config* pd_config); +/// +/// \brief Delete all passes that has a certain type 'pass'. +/// +/// \param[in] pass the certain pass type to be deleted. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDeletePass( + __pd_keep PD_Config* pd_config, const char* pass); +/// +/// \brief Insert a pass to a specific position +/// +/// \param[in] idx the position to insert. +/// \param[in] pass the new pass. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigInsertPass( + __pd_keep PD_Config* pd_config, size_t idx, const char* pass); +/// +/// \brief Append a pass to the end of the passes +/// +/// \param[in] pass the new pass. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigAppendPass( + __pd_keep PD_Config* pd_config, const char* pass); +/// +/// \brief Get information of passes. +/// +/// \return Return list of the passes. 
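The four declarations above expose pass management through the C API, returning the full pass list as a `PD_OneDimArrayCstr` that the caller owns. A hedged C++ sketch of driving them; it assumes the array exposes `size`/`data` members and that `PD_ConfigDestroy` and `PD_OneDimArrayCstrDestroy` are available from the rest of capi_exp, and the pass names are only illustrative:

```cpp
// Sketch only: driving the new pass-management C API from C++.
// Assumes PD_OneDimArrayCstr{size, data} and the usual *Destroy helpers.
#include <cstdio>
#include "paddle/fluid/inference/capi_exp/pd_config.h"
#include "paddle/fluid/inference/capi_exp/pd_types.h"
#include "paddle/fluid/inference/capi_exp/pd_utils.h"

int main() {
  PD_Config* config = PD_ConfigCreate();
  PD_ConfigDeletePass(config, "fc_fuse_pass");          // drop one pass
  PD_ConfigInsertPass(config, 0, "is_test_pass");       // insert at the front
  PD_ConfigAppendPass(config, "memory_optimize_pass");  // append at the end

  // The caller owns the returned array and must release it.
  PD_OneDimArrayCstr* passes = PD_ConfigAllPasses(config);
  for (size_t i = 0; i < passes->size; ++i) {
    std::printf("pass %zu: %s\n", i, passes->data[i]);
  }
  PD_OneDimArrayCstrDestroy(passes);
  PD_ConfigDestroy(config);
  return 0;
}
```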
+/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* PD_ConfigAllPasses( + __pd_keep PD_Config* pd_config); #ifdef __cplusplus } // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.cc b/paddle/fluid/inference/capi_exp/pd_predictor.cc index f5287a5152957f5cda0db9dee82a7689267cd3d2..5ca58b0e4138b274c67cbd988388acc30a0368ae 100644 --- a/paddle/fluid/inference/capi_exp/pd_predictor.cc +++ b/paddle/fluid/inference/capi_exp/pd_predictor.cc @@ -106,4 +106,9 @@ void PD_PredictorDestroy(__pd_take PD_Predictor* pd_predictor) { delete pd_predictor; } +const char* PD_GetVersion() { + static std::string version = paddle_infer::GetVersion(); + return version.c_str(); +} + } // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.h b/paddle/fluid/inference/capi_exp/pd_predictor.h index d4542d0b6d394d2ebc67e6f63b0b52cefb5939b3..33d5160bc3e0d1b1f14c2e9e34e1885ee8ae4f72 100644 --- a/paddle/fluid/inference/capi_exp/pd_predictor.h +++ b/paddle/fluid/inference/capi_exp/pd_predictor.h @@ -143,6 +143,13 @@ PADDLE_CAPI_EXPORT extern uint64_t PD_PredictorTryShrinkMemory( PADDLE_CAPI_EXPORT extern void PD_PredictorDestroy( __pd_take PD_Predictor* pd_predictor); +/// +/// \brief Get version info. +/// +/// \return version +/// +PADDLE_CAPI_EXPORT extern const char* PD_GetVersion(); + #ifdef __cplusplus } // extern "C" #endif diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc index 2e762619f5567c3fce05272815f9a8a0f17d267c..94362b8784bb3501d38799296f88bbfaa05bb176 100644 --- a/paddle/fluid/inference/capi_exp/pd_utils.cc +++ b/paddle/fluid/inference/capi_exp/pd_utils.cc @@ -196,6 +196,8 @@ DataType CvtToCxxDatatype(PD_DataType data_type) { return DataType::INT32; case PD_DATA_UINT8: return DataType::UINT8; + case PD_DATA_INT8: + return DataType::INT8; default: PADDLE_THROW(paddle::platform::errors::InvalidArgument( "Unsupport paddle data type %d.", data_type)); diff --git a/paddle/fluid/inference/goapi/README.md b/paddle/fluid/inference/goapi/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6664014bf937b84583c47ed10d35331a34493de4 --- /dev/null +++ b/paddle/fluid/inference/goapi/README.md @@ -0,0 +1,107 @@ +# Paddle Inference golang API + +The Paddle Inference golang API is built on [capi](../capi_exp) and cgo; you need to prepare the C inference library in advance. + +## Installation + +1. Confirm the CommitId of the Paddle version you are using + +You can confirm the CommitId of your Paddle version with `git log -1`. + +2. Use `go get` to fetch the golang paddle api + +``` +# Use the CommitId recorded in the previous step, assumed here to be 0722297 +COMMITID=0722297 +go get -d -v github.com/paddlepaddle/paddle/paddle/fluid/inference/goapi@${COMMITID} +``` + +3. Download the C inference library + +You can either download the prebuilt [paddle_inference_c](https://github.com/PaddlePaddle/Paddle-Inference-Demo/blob/master/docs/user_guides/download_lib.md) library directly, or build it from source following the official documentation. Note that the cmake build must enable `-DON_INFER=ON`, which produces `paddle_inference_c_install_dir` in the build directory. + + +4. Create a symlink + +go1.15 added the `GOMODCACHE` environment variable, and `go get` downloads code into the `GOMODCACHE` directory by default. You can inspect this path with `go env | grep GOMODCACHE`; in the officially released docker images it usually defaults to `/root/gopath/pkg/mod`. Enter the golang api source path and create a symlink that names the C inference library `paddle_inference_c`. + +```bash +eval $(go env | grep GOMODCACHE) +# Adjust the trailing goapi version number as needed +cd ${GOMODCACHE}/github.com/paddlepaddle/paddle/paddle/fluid/inference/goapi\@v0.0.0-20210623023452-0722297d9b8c/ +ln -s ${PADDLE_C_DOWNLOAD_DIR}/paddle_inference_c_install_dir paddle_inference_c +``` + +5.
Run the unit tests to verify the setup + +``` +bash test.sh +``` + +## Using Paddle Inference in Go + +First, create the inference config +```go +config := paddle.NewConfig() +config.SetModel(model_file, params_file) +``` + +Create the predictor +```go +predictor := paddle.NewPredictor(config) +``` + +Get the input and output Tensors +```go +inNames := predictor.GetInputNames() +inHandle := predictor.GetInputHandle(inNames[0]) + +outNames := predictor.GetOutputNames() +outHandle := predictor.GetOutputHandle(outNames[0]) +``` + +Set the input data (assuming there is only one input) +```go +data := make([]float32, 1*3*224*224) +for i := 0; i < len(data); i++ { + data[i] = float32(i%255) * 0.1 +} +inHandle.Reshape([]int32{1, 3, 224, 224}) +inHandle.CopyFromCpu(data) +``` + +Set the LoD +```go +lod := make([][]uint, 2) +for i:=0; i < len(lod); i++ { + lod[i] = make([]uint, 2) + // set the input ... + lod[i][0] = 0 + lod[i][1] = 10 +} +inHandle.SetLod(lod) +``` + +Run inference +```go +predictor.Run() +``` + +Fetch the data of the output Tensor +```go +func numElements(shape []int32) int32 { + n := int32(1) + for _, v := range shape { + n *= v + } + return n +} + +outData := make([]float32, numElements(outHandle.Shape())) +outHandle.CopyToCpu(outData) +fmt.Println(outHandle.Lod()) +``` + +## Examples + +See [Paddle-Inference-Demo](https://github.com/PaddlePaddle/Paddle-Inference-Demo/tree/master/go) for complete demos. diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go new file mode 100644 index 0000000000000000000000000000000000000000..9200de3d08f71c54f3778e324865712f97eafc9b --- /dev/null +++ b/paddle/fluid/inference/goapi/config.go @@ -0,0 +1,735 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package paddle + +// #include "pd_config.h" +// #include "pd_common.h" +// #include "pd_types.h" +// #include "pd_utils.h" +// #include +// #include +import "C" +import ( + "unsafe" +) + +type Precision C.PD_PrecisionType + +const ( + PrecisionFloat32 Precision = C.PD_PRECISION_FLOAT32 + PrecisionInt8 Precision = C.PD_PRECISION_INT8 + PrecisionHalf Precision = C.PD_PRECISION_HALF +) + +type Config struct { + c *C.PD_Config +} + +/// +/// \brief Create a new config. +/// +func NewConfig() *Config { + cConfig := C.PD_ConfigCreate() + config := &Config{c: cConfig} + return config +} + +/// +/// \brief Set the combined model with two specific paths for program and +/// parameters. +/// +/// \param model model file path of the combined model. +/// \param params params file path of the combined model. +/// +func (config *Config) SetModel(model, params string) { + cmodel := C.CString(model) + cparams := C.CString(params) + C.PD_ConfigSetModel(config.c, cmodel, cparams) + defer func() { + C.free(unsafe.Pointer(cmodel)) + C.free(unsafe.Pointer(cparams)) + }() +} + +/// +/// \brief Set the no-combined model dir path. +/// +/// \param modelDir model dir path.
+/// +func (config *Config) SetModelDir(modelDir string) { + cmodel := C.CString(modelDir) + C.PD_ConfigSetModelDir(config.c, cmodel) + defer C.free(unsafe.Pointer(cmodel)) +} + +/// +/// \brief Set the model file path of a combined model. +/// +/// \param x model file path. +/// +func (config *Config) SetProgFile(model string) { + cmodel := C.CString(model) + C.PD_ConfigSetProgFile(config.c, cmodel) + defer C.free(unsafe.Pointer(cmodel)) +} + +/// +/// \brief Set the params file path of a combined model. +/// +/// \param x params file path. +/// +func (config *Config) SetParamsFile(params string) { + cparams := C.CString(params) + C.PD_ConfigSetParamsFile(config.c, cparams) + defer C.free(unsafe.Pointer(cparams)) +} + +/// +/// \brief Set the path of optimization cache directory. +/// +/// \param cacheDir the path of optimization cache directory. +/// +func (config *Config) SetOptimCacheDir(cacheDir string) { + ccacheDir := C.CString(cacheDir) + C.PD_ConfigSetOptimCacheDir(config.c, ccacheDir) + defer C.free(unsafe.Pointer(ccacheDir)) +} + +/// +/// \brief Get the model directory path. +/// +/// \return string The model directory path. +/// +func (config *Config) ModelDir() string { + return C.GoString(C.PD_ConfigGetModelDir(config.c)) +} + +/// +/// \brief Get the program file path. +/// +/// \return string The program file path. +/// +func (config *Config) ProgFile() string { + return C.GoString(C.PD_ConfigGetProgFile(config.c)) +} + +/// +/// \brief Get the combined parameters file. +/// +/// \return string The combined parameters file. +/// +func (config *Config) ParamsFile() string { + return C.GoString(C.PD_ConfigGetParamsFile(config.c)) +} + +/// +/// \brief Turn off FC Padding. +/// +func (config *Config) DisableFCPadding() { + C.PD_ConfigDisableFCPadding(config.c) +} + +/// +/// \brief A boolean state telling whether fc padding is used. +/// +/// \return bool Whether fc padding is used. +/// +func (config *Config) UseFcPadding() bool { + return cvtPDBoolToGo(C.PD_ConfigUseFcPadding(config.c)) +} + +/// +/// \brief Turn on GPU. +/// +/// \param memorySize initial size of the GPU memory pool in MB. +/// \param deviceId the GPU card to use. +/// +func (config *Config) EnableUseGpu(memorySize uint64, deviceId int32) { + C.PD_ConfigEnableUseGpu(config.c, C.uint64_t(memorySize), C.int32_t(deviceId)) +} + +/// +/// \brief Turn on XPU. +/// +/// \param l3_workspace_size The size of the video memory allocated by the l3 cache, the maximum is 16M. +/// \param locked Whether the allocated L3 cache can be locked. If false, it means that the L3 cache is not locked, and the allocated L3 cache can be shared by multiple models, and multiple models sharing the L3 cache will be executed sequentially on the card. +/// \param autotune Whether to autotune the conv operator in the model. If true, when the conv operator of a certain dimension is executed for the first time, it will automatically search for a better algorithm to improve the performance of subsequent conv operators of the same dimension. +/// \param autotune_file Specify the path of the autotune file. If autotune_file is specified, the algorithm specified in the file will be used and autotune will not be performed again. 
+/// \param precision Calculation accuracy of multi_encoder +/// \param adaptive_seqlen Is the input of multi_encoder variable length +/// +func (config *Config) EnableXpu(l3WorkspaceSize int32, locked bool, autotune bool, autotuneFile string, precision string, adaptiveSeqlen bool) { + cAutotuneFile := C.CString(autotuneFile) + cPrecision := C.CString(precision) + defer func() { + C.free(unsafe.Pointer(cAutotuneFile)) + C.free(unsafe.Pointer(cPrecision)) + }() + C.PD_ConfigEnableXpu(config.c, C.int32_t(l3WorkspaceSize), cvtGoBoolToPD(locked), cvtGoBoolToPD(autotune), + cAutotuneFile, cPrecision, cvtGoBoolToPD(adaptiveSeqlen)) +} + +/// +/// \brief A boolean state telling whether the GPU is turned on. +/// +/// \return bool Whether the GPU is turned on. +/// +func (config *Config) UseGpu() bool { + return cvtPDBoolToGo(C.PD_ConfigUseGpu(config.c)) +} + +/// +/// \brief A boolean state telling whether the XPU is turned on. +/// +/// \return bool Whether the XPU is turned on. +/// +func (config *Config) UseXpu() bool { + return cvtPDBoolToGo(C.PD_ConfigUseXpu(config.c)) +} + +/// +/// \brief Get the GPU device id. +/// +/// \return int32 The GPU device id. +/// +func (config *Config) GpuDeviceId() int32 { + return int32(C.PD_ConfigGpuDeviceId(config.c)) +} + +/// +/// \brief Get the XPU device id. +/// +/// \return int32 The XPU device id. +/// +func (config *Config) XpuDeviceId() int32 { + return int32(C.PD_ConfigXpuDeviceId(config.c)) +} + +/// +/// \brief Get the initial size in MB of the GPU memory pool. +/// +/// \return int32 The initial size in MB of the GPU memory pool. +/// +func (config *Config) MemoryPoolInitSizeMb() int32 { + return int32(C.PD_ConfigMemoryPoolInitSizeMb(config.c)) +} + +/// +/// \brief Get the proportion of the initial memory pool size compared to the +/// device. +/// +/// \return float32 The proportion of the initial memory pool size. +/// +func (config *Config) FractionOfGpuMemoryForPool() float32 { + return float32(C.PD_ConfigFractionOfGpuMemoryForPool(config.c)) +} + +/// +/// \brief Control whether to perform IR graph optimization. +/// If turned off, the AnalysisConfig will act just like a NativeConfig. +/// +/// \param x Whether the ir graph optimization is actived. +/// +func (config *Config) SwitchIrOptim(x bool) { + C.PD_ConfigSwitchIrOptim(config.c, cvtGoBoolToPD(x)) +} + +/// +/// \brief A boolean state telling whether the ir graph optimization is +/// actived. +/// +/// \return bool Whether to use ir graph optimization. +/// +// bool ir_optim() const { return enable_ir_optim_; } +func (config *Config) IrOptim() bool { + return cvtPDBoolToGo(C.PD_ConfigIrOptim(config.c)) +} + +/// +/// \brief Turn on the TensorRT engine. +/// The TensorRT engine will accelerate some subgraphes in the original Fluid +/// computation graph. In some models such as resnet50, GoogleNet and so on, +/// it gains significant performance acceleration. +/// +/// \param workspaceSize The memory size(in byte) used for TensorRT +/// workspace. +/// \param maxBatchSize The maximum batch size of this prediction task, +/// better set as small as possible for less performance loss. +/// \param minSubgraphSize The minimum TensorRT subgraph size needed, if a +/// subgraph is smaller than this, it will not be transferred to TensorRT +/// engine. +/// \param precision The precision used in TensorRT. +/// \param useStatic Serialize optimization information to disk for reusing. +/// \param useCalibMode Use TRT int8 calibration(post training +/// quantization). 
+/// +func (config *Config) EnableTensorRtEngine(workspaceSize int32, maxBatchSize int32, minSubgraphSize int32, + precision Precision, useStatic bool, useCalibMode bool) { + C.PD_ConfigEnableTensorRtEngine(config.c, C.int32_t(workspaceSize), C.int32_t(maxBatchSize), C.int32_t(minSubgraphSize), C.int32_t(precision), cvtGoBoolToPD(useStatic), cvtGoBoolToPD(useCalibMode)) +} + +/// +/// \brief A boolean state telling whether the TensorRT engine is used. +/// +/// \return bool Whether the TensorRT engine is used. +/// +func (config *Config) TensorRtEngineEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigTensorRtEngineEnabled(config.c)) +} + +/// +/// \brief Set min, max, opt shape for TensorRT Dynamic shape mode. +/// \param minInputShape The min input shape of the subgraph input. +/// \param maxInputShape The max input shape of the subgraph input. +/// \param optimInputShape The opt input shape of the subgraph input. +/// \param disableTrtPluginFp16 Setting this parameter to true means that +/// TRT plugin will not run fp16. +/// +func (config *Config) SetTRTDynamicShapeInfo(minInputShape map[string][]int32, maxInputShape map[string][]int32, + optimInputShape map[string][]int32, disableTrtPluginFp16 bool) { + + tensorNum := uint(len(minInputShape)) + names := make([](*C.char), tensorNum) + goNames := make([]string, tensorNum) + var shapeNum []uint + + idx := 0 + for n := range minInputShape { + char := C.CString(n) + defer C.free(unsafe.Pointer(char)) + names[idx] = (*C.char)(unsafe.Pointer(char)) + goNames[idx] = n + shapeNum = append(shapeNum, uint(len(minInputShape[n]))) + idx++ + } + + cMinInputShape := make([]*C.int32_t, len(goNames)) + cMaxInputShape := make([]*C.int32_t, len(goNames)) + cOptInputShape := make([]*C.int32_t, len(goNames)) + for i, n := range goNames { + pMin := (*C.int32_t)(C.malloc(C.size_t(C.sizeof_int32_t * len(minInputShape[n])))) + cMinInputShape[i] = pMin + + // A []C.int32_t slice backed by C memory. + // See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices + // Using [1<<27] instead of [1<<30] so it works on 32-bit architecture + pMinData := (*[1 << 27]C.int32_t)(unsafe.Pointer(pMin)) + for j, v := range minInputShape[n] { + (*pMinData)[j] = C.int32_t(v) + } + defer C.free(unsafe.Pointer(pMin)) + + pMax := (*C.int32_t)(C.malloc(C.size_t(C.sizeof_int32_t * len(maxInputShape[n])))) + cMaxInputShape[i] = pMax + pMaxData := (*[1 << 27]C.int32_t)(unsafe.Pointer(pMax)) + for j, v := range maxInputShape[n] { + (*pMaxData)[j] = C.int32_t(v) + } + defer C.free(unsafe.Pointer(pMax)) + + pOpt := (*C.int32_t)(C.malloc(C.size_t(C.sizeof_int32_t * len(optimInputShape[n])))) + cOptInputShape[i] = pOpt + pOptData := (*[1 << 27]C.int32_t)(unsafe.Pointer(pOpt)) + for j, v := range optimInputShape[n] { + (*pOptData)[j] = C.int32_t(v) + } + defer C.free(unsafe.Pointer(pOpt)) + } + + C.PD_ConfigSetTrtDynamicShapeInfo(config.c, C.size_t(tensorNum), (**C.char)(unsafe.Pointer(&names[0])), + (*C.size_t)(unsafe.Pointer(&shapeNum[0])), + (**C.int32_t)(unsafe.Pointer(&cMinInputShape[0])), + (**C.int32_t)(unsafe.Pointer(&cMaxInputShape[0])), + (**C.int32_t)(unsafe.Pointer(&cOptInputShape[0])), + cvtGoBoolToPD(disableTrtPluginFp16)) +} + +/// +/// \brief Prevent ops running in Paddle-TRT +/// NOTE: just experimental, not an official stable API, easy to be broken. 
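The Go `SetTRTDynamicShapeInfo` above has to marshal three `map[string][]int32` values into C arrays by hand. For comparison, a hedged sketch of the same dynamic-shape setup through the C++ `AnalysisConfig`, which is assumed to take `std::map<std::string, std::vector<int>>` arguments; the tensor name and shapes are placeholders:

```cpp
// Sketch only: TensorRT dynamic-shape setup via the C++ config, which the
// golang wrapper above ultimately feeds through the C API.
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_analysis_config.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("./model/__model__", "./model/params");
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/500, /*device_id=*/0);
  config.EnableTensorRtEngine(/*workspace_size=*/1 << 30, /*max_batch_size=*/1,
                              /*min_subgraph_size=*/3,
                              paddle::AnalysisConfig::Precision::kFloat32,
                              /*use_static=*/false, /*use_calib_mode=*/false);

  // One entry per subgraph input; "image" is a made-up tensor name.
  std::map<std::string, std::vector<int>> min_shape{{"image", {1, 3, 112, 112}}};
  std::map<std::string, std::vector<int>> max_shape{{"image", {1, 3, 448, 448}}};
  std::map<std::string, std::vector<int>> opt_shape{{"image", {1, 3, 224, 224}}};
  config.SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape,
                                /*disable_trt_plugin_fp16=*/false);
  return 0;
}
```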
+/// +func (config *Config) DisableTensorRtOPs(ops []string) { + num := uint(len(ops)) + var buf = make([]*C.char, num+1) + for i, _ := range ops { + char := C.CString(ops[i]) + defer C.free(unsafe.Pointer(char)) + buf[i] = (*C.char)(unsafe.Pointer(char)) + } + + C.PD_ConfigDisableTensorRtOPs(config.c, C.size_t(num), (**C.char)(unsafe.Pointer(&buf[0]))) +} + +/// +/// \brief Replace some TensorRT plugins to TensorRT OSS( +/// https://github.com/NVIDIA/TensorRT), with which some models's inference +/// may be more high-performance. Libnvinfer_plugin.so greater than +/// V7.2.1 is needed. +/// +func (config *Config) EnableTensorRtOSS() { + C.PD_ConfigEnableTensorRtOSS(config.c) +} + +/// +/// \brief A boolean state telling whether to use the TensorRT OSS. +/// +/// \return bool Whether to use the TensorRT OSS. +/// +func (config *Config) TensorrtOssEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigTensorRtOssEnabled(config.c)) +} + +/// +/// \brief Enable TensorRT DLA +/// \param dlaCore ID of DLACore, which should be 0, 1, +/// ..., IBuilder.getNbDLACores() - 1 +/// +func (config *Config) EnableTensorRtDLA(dlaCore int32) { + C.PD_ConfigEnableTensorRtDla(config.c, C.int32_t(dlaCore)) +} + +/// +/// \brief A boolean state telling whether to use the TensorRT DLA. +/// +/// \return bool Whether to use the TensorRT DLA. +/// +func (config *Config) TensorrtDlaEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigTensorRtDlaEnabled(config.c)) +} + +/// +/// \brief Turn on the usage of Lite sub-graph engine. +/// +/// \param precision Precion used in Lite sub-graph engine. +/// \param zeroCopy Set the zero copy mode. +/// \param passesFilter Set the passes used in Lite sub-graph engine. +/// \param opsFilter Operators not supported by Lite. +/// +func (config *Config) EnableLiteEngine(precision Precision, zeroCopy bool, passesFilter []string, opsFilter []string) { + passesFilterNum := uint(len(passesFilter)) + var passesFilterBuf = make([]*C.char, passesFilterNum+1) + for i, _ := range passesFilter { + char := C.CString(passesFilter[i]) + defer C.free(unsafe.Pointer(char)) + passesFilterBuf[i] = (*C.char)(unsafe.Pointer(char)) + } + + opsFilterNum := uint(len(opsFilter)) + var opsFilterBuf = make([]*C.char, passesFilterNum+1) + for i, _ := range opsFilter { + char := C.CString(opsFilter[i]) + defer C.free(unsafe.Pointer(char)) + opsFilterBuf[i] = (*C.char)(unsafe.Pointer(char)) + } + + C.PD_ConfigEnableLiteEngine(config.c, C.int32_t(precision), cvtGoBoolToPD(zeroCopy), C.size_t(passesFilterNum), (**C.char)(unsafe.Pointer(&passesFilterBuf[0])), C.size_t(opsFilterNum), (**C.char)(unsafe.Pointer(&opsFilterBuf[0]))) +} + +/// +/// \brief A boolean state indicating whether the Lite sub-graph engine is +/// used. +/// +/// \return bool whether the Lite sub-graph engine is used. +/// +func (config *Config) LiteEngineEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigLiteEngineEnabled(config.c)) +} + +/// +/// \brief Control whether to debug IR graph analysis phase. +/// This will generate DOT files for visualizing the computation graph after +/// each analysis pass applied. +/// +/// \param x whether to debug IR graph analysis phase. +/// +func (config *Config) SwitchIrDebug(x bool) { + C.PD_ConfigSwitchIrDebug(config.c, cvtGoBoolToPD(x)) +} + +/// +/// \brief Turn on MKLDNN. +/// +func (config *Config) EnableMKLDNN() { + C.PD_ConfigEnableMKLDNN(config.c) +} + +/// +/// \brief Set the cache capacity of different input shapes for MKLDNN. +/// Default value 0 means not caching any shape. 
+/// Please see MKL-DNN Data Caching Design Document: +/// https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md +/// +/// \param capacity The cache capacity. +/// +func (config *Config) SetMkldnnCacheCapacity(capacity int32) { + C.PD_ConfigSetMkldnnCacheCapacity(config.c, C.int32_t(capacity)) +} + +/// +/// \brief A boolean state telling whether to use the MKLDNN. +/// +/// \return bool Whether to use the MKLDNN. +/// +func (config *Config) MkldnnEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigMkldnnEnabled(config.c)) +} + +/// +/// \brief Set the number of cpu math library threads. +/// +/// \param mathThreadsNum The number of cpu math library +/// threads. +/// +func (config *Config) SetCpuMathLibraryNumThreads(mathThreadsNum int) { + C.PD_ConfigSetCpuMathLibraryNumThreads(config.c, C.int32_t(mathThreadsNum)) +} + +/// +/// \brief An int state telling how many threads are used in the CPU math +/// library. +/// +/// \return int The number of threads used in the CPU math library. +/// +func (config *Config) CpuMathLibraryNumThreads() int32 { + return int32(C.PD_ConfigGetCpuMathLibraryNumThreads(config.c)) +} + +/// +/// \brief Transform the AnalysisConfig to NativeConfig. +/// +/// \return NativeConfig The NativeConfig transformed. +/// +// NativeConfig ToNativeConfig() const; + +/// +/// \brief Specify the operator type list to use MKLDNN acceleration. +/// +/// \param opList The operator type list. +/// +func (config *Config) SetMKLDNNOp(opList []string) { + num := uint(len(opList)) + // Add one in case num is zero. + var buf = make([]*C.char, num+1) + for i, _ := range opList { + char := C.CString(opList[i]) + defer C.free(unsafe.Pointer(char)) + buf[i] = (*C.char)(unsafe.Pointer(char)) + } + + C.PD_ConfigSetMkldnnOp(config.c, C.size_t(num), (**C.char)(unsafe.Pointer(&buf[0]))) +} + +/// +/// \brief Turn on MKLDNN quantization. +/// +func (config *Config) EnableMkldnnQuantizer() { + C.PD_ConfigEnableMkldnnQuantizer(config.c) +} + +/// +/// \brief Turn on MKLDNN bfloat16. +/// +func (config *Config) EnableMkldnnBfloat16() { + C.PD_ConfigEnableMkldnnBfloat16(config.c) +} + +/// +/// \brief A boolean state telling whether to use the MKLDNN Bfloat16. +/// +/// \return bool Whether to use the MKLDNN Bfloat16. +/// +func (config *Config) MkldnnBfloat16Enabled() bool { + return cvtPDBoolToGo(C.PD_ConfigMkldnnBfloat16Enabled(config.c)) +} + +/// \brief Specify the operator type list to use Bfloat16 acceleration. +/// +/// \param opList The operator type list. +/// +func (config *Config) SetBfloat16Op(opList []string) { + num := uint(len(opList)) + // Add one in case num is zero. + var buf = make([]*C.char, num+1) + for i, _ := range opList { + char := C.CString(opList[i]) + defer C.free(unsafe.Pointer(char)) + buf[i] = (*C.char)(unsafe.Pointer(char)) + } + + C.PD_ConfigSetBfloat16Op(config.c, C.size_t(num), (**C.char)(unsafe.Pointer(&buf[0]))) +} + +/// +/// \brief A boolean state telling whether the thread local CUDA stream is +/// enabled. +/// +/// \return bool Whether the thread local CUDA stream is enabled. +/// +func (config *Config) ThreadLocalStreamEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigThreadLocalStreamEnabled(config.c)) +} + +/// +/// \brief A boolean state telling whether the MKLDNN quantization is enabled. +/// +/// \return bool Whether the MKLDNN quantization is enabled. 
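For CPU deployments, the MKLDNN setters above combine roughly as follows; this is a sketch with illustrative values (mirroring the ones used in config_test.go), not an excerpt from the patch.

package main

import (
    paddle "github.com/paddlepaddle/paddle/paddle/fluid/inference/goapi"
)

func main() {
    config := paddle.NewConfig()
    config.SetModelDir("modelDir") // placeholder model directory

    config.EnableMKLDNN()
    config.SetMkldnnCacheCapacity(4)      // cache kernels for up to 4 different input shapes
    config.SetCpuMathLibraryNumThreads(4) // CPU math library threads
    config.SetMKLDNNOp([]string{"fc", "conv"})

    // Optional bfloat16 path for the listed op types.
    config.EnableMkldnnBfloat16()
    config.SetBfloat16Op([]string{"fc", "mul"})
}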
+/// +func (config *Config) MkldnnQuantizerEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigMkldnnQuantizerEnabled(config.c)) +} + +/// +/// \brief Specify the memory buffer of program and parameter. +/// Used when model and params are loaded directly from memory. +/// +/// \param prog The memory buffer of program. +/// \param params The memory buffer of the combined parameters file. +/// +func (config *Config) SetModelBuffer(prog, params string) { + cProg := C.CString(prog) + cParams := C.CString(params) + defer func() { + C.free(unsafe.Pointer(cProg)) + C.free(unsafe.Pointer(cParams)) + }() + + C.PD_ConfigSetModelBuffer(config.c, cProg, C.size_t(len(prog)), cParams, C.size_t(len(params))) +} + +/// +/// \brief A boolean state telling whether the model is set from the CPU +/// memory. +/// +/// \return bool Whether model and params are loaded directly from memory. +/// +func (config *Config) ModelFromMemory() bool { + return cvtPDBoolToGo(C.PD_ConfigModelFromMemory(config.c)) +} + +/// +/// \brief Turn on memory optimize +/// NOTE still in development. +/// +func (config *Config) EnableMemoryOptim() { + C.PD_ConfigEnableMemoryOptim(config.c) +} + +/// +/// \brief A boolean state telling whether the memory optimization is +/// activated. +/// +/// \return bool Whether the memory optimization is activated. +/// +func (config *Config) MemoryOptimEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigMemoryOptimEnabled(config.c)) +} + +/// +/// \brief Turn on profiling report. +/// If not turned on, no profiling report will be generated. +/// +func (config *Config) EnableProfile() { + C.PD_ConfigEnableProfile(config.c) +} + +/// +/// \brief A boolean state telling whether the profiler is activated. +/// +/// \return bool Whether the profiler is activated. +/// +func (config *Config) ProfileEnabled() bool { + return cvtPDBoolToGo(C.PD_ConfigProfileEnabled(config.c)) +} + +/// +/// \brief Mute all logs in Paddle inference. +/// +func (config *Config) DisableGlogInfo() { + C.PD_ConfigDisableGlogInfo(config.c) +} + +/// +/// \brief A boolean state telling whether logs in Paddle inference are muted. +/// +/// \return bool Whether logs in Paddle inference are muted. +/// +func (config *Config) GlogInfoDisabled() bool { + return cvtPDBoolToGo(C.PD_ConfigGlogInfoDisabled(config.c)) +} + +/// +/// \brief A boolean state telling whether the AnalysisConfig is valid. +/// +/// \return bool Whether the AnalysisConfig is valid. +/// +func (config *Config) IsValid() bool { + return cvtPDBoolToGo(C.PD_ConfigIsValid(config.c)) +} + +/// +/// \brief Enable the GPU multi-computing stream feature. +/// NOTE: The current behavior of this interface is to bind the computation +/// stream to the thread, and this behavior may be changed in the future. +/// +func (config *Config) EnableGpuMultiStream() { + C.PD_ConfigEnableGpuMultiStream(config.c) +} + +/// +/// \brief Delete all passes that has a certain type 'pass'. +/// +/// \param[in] pass the certain pass type to be deleted. +/// +func (config *Config) DeletePass(pass string) { + cPass := C.CString(pass) + C.PD_ConfigDeletePass(config.c, cPass) + C.free(unsafe.Pointer(cPass)) +} + +/// +/// \brief Append a pass to the end of the passes +/// +/// \param[in] pass the new pass. +/// +func (config *Config) AppendPass(pass string) { + cPass := C.CString(pass) + C.PD_ConfigAppendPass(config.c, cPass) + C.free(unsafe.Pointer(cPass)) +} + +/// +/// \brief Insert a pass to a specific position +/// +/// \param[in] idx the position to insert. 
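A sketch of loading the program and parameters from memory with SetModelBuffer, and of the pass-management helpers; the file paths are placeholders and "test_pass" is the same dummy pass name used in config_test.go.

package main

import (
    "io/ioutil"
    "log"

    paddle "github.com/paddlepaddle/paddle/paddle/fluid/inference/goapi"
)

func main() {
    // Read both files into memory first (placeholder paths).
    prog, err := ioutil.ReadFile("inference.pdmodel")
    if err != nil {
        log.Fatal(err)
    }
    params, err := ioutil.ReadFile("inference.pdiparams")
    if err != nil {
        log.Fatal(err)
    }

    config := paddle.NewConfig()
    config.SetModelBuffer(string(prog), string(params))
    log.Println("ModelFromMemory:", config.ModelFromMemory()) // should report true after SetModelBuffer

    // Inspect and tweak the IR pass list.
    config.AppendPass("test_pass")
    log.Println("AllPasses:", config.AllPasses())
    config.DeletePass("test_pass")
}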
+/// \param[in] pass the new pass. +/// +func (config *Config) InsertPass(idx uint64, pass string) { + cPass := C.CString(pass) + C.PD_ConfigInsertPass(config.c, C.size_t(idx), cPass) + C.free(unsafe.Pointer(cPass)) +} + +/// +/// \brief Get information of passes. +/// +/// \return Return list of the passes. +/// +func (config *Config) AllPasses() []string { + cPasses := C.PD_ConfigAllPasses(config.c) + num := int(cPasses.size) + passes := cvtToGoSliceString(num, cPasses.data) + C.PD_OneDimArrayCstrDestroy(cPasses) + return passes +} diff --git a/paddle/fluid/inference/goapi/config_test.go b/paddle/fluid/inference/goapi/config_test.go new file mode 100644 index 0000000000000000000000000000000000000000..e7b2c956a924ae201be3cbc9a8a299ab053d8142 --- /dev/null +++ b/paddle/fluid/inference/goapi/config_test.go @@ -0,0 +1,122 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package paddle + +import "testing" + +func TestNewConfig(t *testing.T) { + config := NewConfig() + config.SetProgFile("model") + config.SetParamsFile("params") + + config.SetOptimCacheDir("cache") + + config.DisableFCPadding() + t.Logf("UseFcPadding:%+v", config.UseFcPadding()) + + // It will break when we have no xpu env. 
+ // config.EnableXpu(100) + // t.Logf("EnableXpu, UseXpu:%+v ", config.UseXpu()) + + config.SwitchIrOptim(true) + t.Logf("IrOptim:%+v", config.IrOptim()) + + config.EnableUseGpu(100, 0) + t.Logf("use_gpu:%+v, gpu_id:%+v", config.UseGpu(), config.GpuDeviceId()) + t.Logf("MemoryPoolInitSizeMb:%+v, FractionOfGpuMemoryForPool:%+v", config.MemoryPoolInitSizeMb(), config.FractionOfGpuMemoryForPool()) + + config.EnableTensorRtEngine(1024, 16, 3, PrecisionFloat32, false, false) + t.Logf("TensorRtEngineEnabled:%+v", config.TensorRtEngineEnabled()) + + minInputShape := map[string][]int32{ + "image": []int32{-1, 3, 100, 100}, + "shape": []int32{-1, 2}, + } + maxInputShape := map[string][]int32{ + "image": []int32{-1, 3, 608, 608}, + "shape": []int32{-1, 2}, + } + optInputShape := map[string][]int32{ + "image": []int32{-1, 3, 406, 406}, + "shape": []int32{-1, 2}, + } + config.SetTRTDynamicShapeInfo(minInputShape, maxInputShape, optInputShape, false) + + config.EnableTensorRtOSS() + t.Logf("TensorrtOssEnabled:%+v", config.TensorrtOssEnabled()) + + config.EnableTensorRtDLA(0) + t.Logf("TensorrtDlaEnabled:%+v", config.TensorrtDlaEnabled()) + + config.DisableTensorRtOPs([]string{"mul", "fc"}) + + config.EnableGpuMultiStream() + t.Logf("ThreadLocalStreamEnabled:%+v", config.ThreadLocalStreamEnabled()) + + config.SwitchIrDebug(false) + + config.EnableMKLDNN() + + config.EnableMemoryOptim() + t.Logf("MemoryOptimEnabled:%+v", config.MemoryOptimEnabled()) + + config.EnableProfile() + t.Logf("ProfileEnabled:%+v", config.ProfileEnabled()) + + config.DisableGlogInfo() + t.Logf("GlogInfoDisabled:%+v", config.GlogInfoDisabled()) + + t.Logf("IsValid:%+v", config.IsValid()) + + config.AppendPass("test_pass") + t.Logf("After AppendPass, AllPasses:%+v", config.AllPasses()) + + config.DeletePass("test_pass") + t.Logf("After DeletePass, AllPasses:%+v", config.AllPasses()) +} + +func TestLite(t *testing.T) { + config := NewConfig() + config.SetModel("model", "params") + t.Log(config.ProgFile()) + t.Log(config.ParamsFile()) + + config.EnableLiteEngine(PrecisionFloat32, true, []string{}, []string{}) + t.Logf("LiteEngineEnabled:%+v", config.LiteEngineEnabled()) +} + +func TestMkldnn(t *testing.T) { + config := NewConfig() + config.SetModelDir("modelDir") + t.Log(config.ModelDir()) + + config.EnableMKLDNN() + t.Logf("MkldnnEnabled:%+v", config.MkldnnEnabled()) + + config.SetMkldnnCacheCapacity(4) + + config.SetCpuMathLibraryNumThreads(4) + t.Logf("CpuMathLibraryNumThreads:%+v", config.CpuMathLibraryNumThreads()) + + config.SetMKLDNNOp([]string{"fc", "conv"}) + + config.EnableMkldnnQuantizer() + t.Logf("MkldnnQuantizerEnabled:%+v", config.MkldnnQuantizerEnabled()) + + config.EnableMkldnnBfloat16() + t.Logf("MkldnnBfloat16Enabled:%+v", config.MkldnnBfloat16Enabled()) + + config.SetBfloat16Op([]string{"fc", "mul"}) +} diff --git a/paddle/fluid/inference/goapi/go.mod b/paddle/fluid/inference/goapi/go.mod new file mode 100644 index 0000000000000000000000000000000000000000..96e04486f0ffbf2df33698606704db19507adcc8 --- /dev/null +++ b/paddle/fluid/inference/goapi/go.mod @@ -0,0 +1,3 @@ +module github.com/paddlepaddle/paddle/paddle/fluid/inference/goapi + +go 1.15 diff --git a/paddle/fluid/operators/increment_op.cu b/paddle/fluid/inference/goapi/lib.go similarity index 54% rename from paddle/fluid/operators/increment_op.cu rename to paddle/fluid/inference/goapi/lib.go index 228063bf3d4b24bbd03649189f6ddba9a5f0ca30..b87561577714fe97a62b74645a7f7cfbb14dce06 100644 --- a/paddle/fluid/operators/increment_op.cu +++ 
b/paddle/fluid/inference/goapi/lib.go @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,11 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/increment_op.h" +package paddle -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - increment, ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel); +// #cgo CFLAGS: -I${SRCDIR}/paddle_inference_c/paddle/include +// #cgo LDFLAGS: -L${SRCDIR}/paddle_inference_c/paddle/lib -lpaddle_inference_c +import "C" diff --git a/paddle/fluid/inference/goapi/predictor.go b/paddle/fluid/inference/goapi/predictor.go new file mode 100644 index 0000000000000000000000000000000000000000..fb8c8892b6676e210e6304ed6db076a3c20178d8 --- /dev/null +++ b/paddle/fluid/inference/goapi/predictor.go @@ -0,0 +1,166 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package paddle + +// #include "pd_predictor.h" +// #include "pd_tensor.h" +// #include "pd_common.h" +// #include "pd_types.h" +// #include "pd_utils.h" +// #include +// #include +import "C" +import ( + "runtime" + "unsafe" +) + +type Predictor struct { + c *C.PD_Predictor +} + +/// +/// \brief Create a new Predictor +/// +/// \param[in] Config config +/// \return new predicor. +/// +func NewPredictor(config *Config) *Predictor { + cPredictor := C.PD_PredictorCreate(config.c) + predictor := &Predictor{c: cPredictor} + runtime.SetFinalizer(predictor, func(predictor *Predictor) { + C.PD_PredictorDestroy(predictor.c) + }) + return predictor +} + +/// +/// \brief Clone a new Predictor +/// +/// \return new predictor. 
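Before the remaining Predictor methods, an end-to-end sketch of the typical call sequence (create the Config, build a Predictor, fill an input handle, Run, read the output handle); the mobilenetv1 paths and the 1x3x224x224 input shape are assumptions borrowed from the test file added later in this patch.

package main

import (
    "fmt"

    paddle "github.com/paddlepaddle/paddle/paddle/fluid/inference/goapi"
)

func main() {
    config := paddle.NewConfig()
    config.SetModel("./mobilenetv1/inference.pdmodel", "./mobilenetv1/inference.pdiparams")

    predictor := paddle.NewPredictor(config)

    // Feed a single zero-filled 1x3x224x224 float32 image.
    in := predictor.GetInputHandle(predictor.GetInputNames()[0])
    in.Reshape([]int32{1, 3, 224, 224})
    in.CopyFromCpu(make([]float32, 1*3*224*224))

    predictor.Run()

    // Size the output buffer from the reported shape, then copy it back.
    out := predictor.GetOutputHandle(predictor.GetOutputNames()[0])
    n := int32(1)
    for _, d := range out.Shape() {
        n *= d
    }
    result := make([]float32, n)
    out.CopyToCpu(result)
    fmt.Println("output elements:", n)
}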
+/// +func (p *Predictor) Clone() *Predictor { + cPredictor := C.PD_PredictorClone(p.c) + predictor := &Predictor{c: cPredictor} + runtime.SetFinalizer(predictor, func(predictor *Predictor) { + C.PD_PredictorDestroy(predictor.c) + }) + return predictor +} + +/// +/// \brief Get the input number +/// +/// \return input number +/// +func (p *Predictor) GetInputNum() uint { + return uint(C.PD_PredictorGetInputNum(p.c)) +} + +/// +/// \brief Get the output number +/// +/// \return output number +/// +func (p *Predictor) GetOutputNum() uint { + return uint(C.PD_PredictorGetOutputNum(p.c)) +} + +/// +/// \brief Get the input names +/// +/// \return input names +/// +func (p *Predictor) GetInputNames() []string { + cNames := C.PD_PredictorGetInputNames(p.c) + numNames := int(cNames.size) + names := cvtToGoSliceString(numNames, cNames.data) + C.PD_OneDimArrayCstrDestroy(cNames) + return names +} + +/// +/// \brief Get the output names +/// +/// \return output names +/// +func (p *Predictor) GetOutputNames() []string { + cNames := C.PD_PredictorGetOutputNames(p.c) + numNames := int(cNames.size) + names := cvtToGoSliceString(numNames, cNames.data) + C.PD_OneDimArrayCstrDestroy(cNames) + return names +} + +/// +/// \brief Get the Input Tensor object +/// +/// \param[in] name input name +/// \return input tensor +/// +func (p *Predictor) GetInputHandle(name string) *Tensor { + cName := C.CString(name) + cHandle := C.PD_PredictorGetInputHandle(p.c, cName) + C.free(unsafe.Pointer(cName)) + handle := &Tensor{c: cHandle} + runtime.SetFinalizer(handle, func(handle *Tensor) { + C.PD_TensorDestroy(handle.c) + }) + return handle +} + +/// +/// \brief Get the Output Tensor object +/// +/// \param[in] name output name +/// \return output tensor +/// +func (p *Predictor) GetOutputHandle(name string) *Tensor { + cName := C.CString(name) + cHandle := C.PD_PredictorGetOutputHandle(p.c, cName) + C.free(unsafe.Pointer(cName)) + handle := &Tensor{c: cHandle} + runtime.SetFinalizer(handle, func(handle *Tensor) { + C.PD_TensorDestroy(handle.c) + }) + return handle +} + +/// +/// \brief Run the prediction engine +/// +func (p *Predictor) Run() { + C.PD_PredictorRun(p.c) +} + +/// +/// \brief Clear the intermediate tensors of the predictor +/// +func (p *Predictor) ClearIntermediateTensor() { + C.PD_PredictorClearIntermediateTensor(p.c) +} + +/// +/// \brief Release all tmp tensor to compress the size of the memory pool. +/// The memory pool is considered to be composed of a list of chunks, if +/// the chunk is not occupied, it can be released. +/// +/// \return Number of bytes released. It may be smaller than the actual +/// released memory, because part of the memory is not managed by the +/// MemoryPool. +/// +func (p *Predictor) TryShrinkMemory() { + C.PD_PredictorTryShrinkMemory(p.c) +} diff --git a/paddle/fluid/inference/goapi/predictor_test.go b/paddle/fluid/inference/goapi/predictor_test.go new file mode 100644 index 0000000000000000000000000000000000000000..a5df1048ca2a56901dd8203affbed3ed36b2a075 --- /dev/null +++ b/paddle/fluid/inference/goapi/predictor_test.go @@ -0,0 +1,115 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package paddle
+
+import (
+    "io/ioutil"
+    "os"
+    "testing"
+)
+
+func TestNewPredictor(t *testing.T) {
+    t.Logf("Version:\n%+v", Version())
+    config := NewConfig()
+    config.SetModel("./mobilenetv1/inference.pdmodel", "./mobilenetv1/inference.pdiparams")
+    config.EnableUseGpu(100, 0)
+    predictor := NewPredictor(config)
+    inNames := predictor.GetInputNames()
+    t.Logf("InputNames:%+v", inNames)
+    outNames := predictor.GetOutputNames()
+    t.Logf("OutputNames:%+v", outNames)
+
+    inHandle := predictor.GetInputHandle(inNames[0])
+    inHandle.Reshape([]int32{1, 3, 224, 224})
+    t.Logf("inHandle name:%+v, shape:%+v", inHandle.Name(), inHandle.Shape())
+
+    var lod [][]uint
+    lod = append(lod, []uint{0, 1, 2})
+    lod = append(lod, []uint{1, 2, 3, 4})
+    inHandle.SetLod(lod)
+    t.Logf("inHandle Lod:%+v", inHandle.Lod())
+    data := make([]float32, numElements([]int32{1, 3, 224, 224}))
+    for i := 0; i < int(numElements([]int32{1, 3, 224, 224})); i++ {
+        data[i] = float32(i%255) * 0.1
+    }
+    inHandle.CopyFromCpu(data)
+    t.Logf("inHandle Type:%+v", inHandle.Type())
+
+    predictor.Run()
+
+    outHandle := predictor.GetOutputHandle(outNames[0])
+    t.Logf("outHandle name:%+v", outHandle.Name())
+
+    outShape := outHandle.Shape()
+    t.Logf("outHandle Shape:%+v", outShape)
+    outData := make([]float32, numElements(outShape))
+    outHandle.CopyToCpu(outData)
+    t.Log(outData)
+
+    cloned := predictor.Clone()
+    t.Logf("InputNum:%+v", cloned.GetInputNum())
+    t.Logf("OutputNum:%+v", cloned.GetOutputNum())
+    cloned.ClearIntermediateTensor()
+}
+
+func TestFromBuffer(t *testing.T) {
+    modelFile, err := os.Open("./mobilenetv1/inference.pdmodel")
+    if err != nil {
+        t.Fatal(err)
+    }
+    paramsFile, err := os.Open("./mobilenetv1/inference.pdiparams")
+    if err != nil {
+        t.Fatal(err)
+    }
+    defer modelFile.Close()
+    defer paramsFile.Close()
+
+    model, err := ioutil.ReadAll(modelFile)
+    if err != nil {
+        t.Fatal(err)
+    }
+    params, err := ioutil.ReadAll(paramsFile)
+    if err != nil {
+        t.Fatal(err)
+    }
+    config := NewConfig()
+    config.SetModelBuffer(string(model), string(params))
+
+    predictor := NewPredictor(config)
+    inNames := predictor.GetInputNames()
+    outNames := predictor.GetOutputNames()
+    inHandle := predictor.GetInputHandle(inNames[0])
+    inHandle.Reshape([]int32{1, 3, 224, 224})
+    data := make([]float32, numElements([]int32{1, 3, 224, 224}))
+    for i := 0; i < int(numElements([]int32{1, 3, 224, 224})); i++ {
+        data[i] = float32(i%255) * 0.1
+    }
+    inHandle.CopyFromCpu(data)
+    predictor.Run()
+    outHandle := predictor.GetOutputHandle(outNames[0])
+    outShape := outHandle.Shape()
+    t.Logf("outHandle Shape:%+v", outShape)
+    outData := make([]float32, numElements(outShape))
+    outHandle.CopyToCpu(outData)
+    t.Log(outData)
+}
+
+func numElements(shape []int32) int32 {
+    n := int32(1)
+    for _, v := range shape {
+        n *= v
+    }
+    return n
+}
diff --git a/paddle/fluid/inference/goapi/tensor.go b/paddle/fluid/inference/goapi/tensor.go
new file mode 100644
index 0000000000000000000000000000000000000000..b4ad1d8f766c7596d6fc767040428ba468736649
--- /dev/null
+++ b/paddle/fluid/inference/goapi/tensor.go
@@ -0,0 +1,240 @@
+// Copyright
(c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package paddle + +// #include "pd_tensor.h" +// #include "pd_utils.h" +// #include "pd_types.h" +// #include "pd_common.h" +// #include "stdlib.h" +import "C" +import ( + "fmt" + "reflect" + "unsafe" +) + +type DataType C.PD_DataType + +const ( + Unk DataType = C.PD_DATA_UNK + Float32 DataType = C.PD_DATA_FLOAT32 + Int32 DataType = C.PD_DATA_INT32 + Int64 DataType = C.PD_DATA_INT64 + Uint8 DataType = C.PD_DATA_UINT8 + Int8 DataType = C.PD_DATA_INT8 +) + +type PlaceType C.PD_PlaceType + +const ( + UnkPlace PlaceType = C.PD_PLACE_UNK + CpuPlace PlaceType = C.PD_PLACE_CPU + GpuPlace PlaceType = C.PD_PLACE_GPU + XpuPlace PlaceType = C.PD_PLACE_XPU +) + +type Tensor struct { + c *C.PD_Tensor +} + +/// +/// \brief Reset the shape of the tensor. +/// Generally it's only used for the input tensor. +/// +/// \param[in] shape The shape to set. +/// +func (t *Tensor) Reshape(shape []int32) { + C.PD_TensorReshape(t.c, C.size_t(len(shape)), (*C.int32_t)(unsafe.Pointer(&shape[0]))) +} + +/// +/// \brief Get the tensor shape +/// +/// \return The tensor shape. +/// +func (t *Tensor) Shape() []int32 { + cData := C.PD_TensorGetShape(t.c) + length := int(cData.size) + defer C.PD_OneDimArrayInt32Destroy(cData) + return cvtToGoSliceInt32(length, cData.data) +} + +/// +/// \brief Set the tensor lod information +/// \param[in] pd_tensor tensor. +/// \param[in] lod lod information. +/// +func (t *Tensor) SetLod(lod [][]uint) { + cLod := (*C.struct_PD_TwoDimArraySize)(C.malloc(C.size_t(C.sizeof_struct_PD_TwoDimArraySize))) + length := len(lod) + cLod.size = C.size_t(uint(length)) + var lodList = make([]*C.struct_PD_OneDimArraySize, length+1) + + for i, v := range lod { + oneDimArray := (*C.struct_PD_OneDimArraySize)(C.malloc(C.size_t(C.sizeof_struct_PD_OneDimArraySize))) + defer C.free(unsafe.Pointer(oneDimArray)) + tmpLength := len(v) + oneDimArray.size = C.size_t(uint(tmpLength)) + + tmpC := (*C.size_t)(C.malloc(C.size_t(C.sizeof_size_t * tmpLength))) + defer C.free(unsafe.Pointer(tmpC)) + tmpSlice := (*[1 << 27]C.size_t)(unsafe.Pointer(tmpC))[:tmpLength:tmpLength] + for j, w := range v { + tmpSlice[j] = C.size_t(w) + } + oneDimArray.data = tmpC + + lodList[i] = oneDimArray + } + cLod.data = (**C.struct_PD_OneDimArraySize)(unsafe.Pointer(&lodList[0])) + C.PD_TensorSetLod(t.c, cLod) + C.free(unsafe.Pointer(cLod)) + // C.PD_TwoDimArraySizeDestroy(cLod) +} + +/// +/// \brief Get the tensor lod information +/// +/// \return the lod information. 
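The SetLod/Lod pair above marshals a Go [][]uint through the C PD_TwoDimArraySize struct and back; a short sketch of attaching LoD offsets to an input tensor follows (the offsets mirror those in predictor_test.go, and attachLod together with its package are hypothetical).

package lodexample

import (
    "fmt"

    paddle "github.com/paddlepaddle/paddle/paddle/fluid/inference/goapi"
)

// attachLod sets two LoD levels on an input tensor obtained via
// Predictor.GetInputHandle and prints the values read back through Lod().
func attachLod(in *paddle.Tensor) {
    lod := [][]uint{
        {0, 1, 2},    // level-0 offsets
        {1, 2, 3, 4}, // level-1 offsets
    }
    in.SetLod(lod)
    fmt.Println("lod round-trip:", in.Lod())
}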
+/// +func (t *Tensor) Lod() [][]uint { + cLod := C.PD_TensorGetLod(t.c) + length := int(cLod.size) + res := make([][]uint, length) + if length == 0 { + return res + } + cLodSlice := (*[1 << 27]*C.struct_PD_OneDimArraySize)(unsafe.Pointer(cLod.data))[:length:length] + + for i := 0; i < length; i++ { + size := uint(cLodSlice[i].size) + lod := make([]uint, size) + + tmpSlice := (*[1 << 27]C.size_t)(unsafe.Pointer(cLodSlice[i].data))[:size:size] + for j, v := range tmpSlice { + lod[j] = uint(v) + } + + res[i] = lod + } + + C.PD_TwoDimArraySizeDestroy(cLod) + return res +} + +/// +/// \brief Get the tensor data type +/// \param[in] pd_tensor tensor. +/// \return the tensor data type. +/// +func (t *Tensor) Type() DataType { + cDtype := C.PD_TensorGetDataType(t.c) + return DataType(cDtype) +} + +/// +/// \brief Get the tensor name +/// +/// \return the tensor name. +/// +func (t *Tensor) Name() string { + return C.GoString(C.PD_TensorGetName(t.c)) +} + +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// +/// \param[in] value +/// +func (t *Tensor) CopyFromCpu(value interface{}) { + val := reflect.ValueOf(value) + dtype, _ := dataTypeOf(val) + + switch dtype { + case Float32: + data := val.Interface().([]float32) + C.PD_TensorCopyFromCpuFloat(t.c, (*C.float)(unsafe.Pointer(&data[0]))) + case Int32: + data := val.Interface().([]int32) + C.PD_TensorCopyFromCpuInt32(t.c, (*C.int32_t)(unsafe.Pointer(&data[0]))) + case Int64: + data := val.Interface().([]int64) + C.PD_TensorCopyFromCpuInt64(t.c, (*C.int64_t)(unsafe.Pointer(&data[0]))) + case Uint8: + data := val.Interface().([]uint8) + C.PD_TensorCopyFromCpuUint8(t.c, (*C.uint8_t)(unsafe.Pointer(&data[0]))) + case Int8: + data := val.Interface().([]int8) + C.PD_TensorCopyFromCpuInt8(t.c, (*C.int8_t)(unsafe.Pointer(&data[0]))) + } +} + +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// +/// \param[value] data The tensor will copy the data to the address. 
+/// +func (t *Tensor) CopyToCpu(value interface{}) { + val := reflect.ValueOf(value) + dtype, _ := dataTypeOf(val) + + switch dtype { + case Float32: + data := val.Interface().([]float32) + C.PD_TensorCopyToCpuFloat(t.c, (*C.float)(unsafe.Pointer(&data[0]))) + case Int32: + data := val.Interface().([]int32) + C.PD_TensorCopyToCpuInt32(t.c, (*C.int32_t)(unsafe.Pointer(&data[0]))) + case Int64: + data := val.Interface().([]int64) + C.PD_TensorCopyToCpuInt64(t.c, (*C.int64_t)(unsafe.Pointer(&data[0]))) + case Uint8: + data := val.Interface().([]uint8) + C.PD_TensorCopyToCpuUint8(t.c, (*C.uint8_t)(unsafe.Pointer(&data[0]))) + case Int8: + data := val.Interface().([]int8) + C.PD_TensorCopyToCpuInt8(t.c, (*C.int8_t)(unsafe.Pointer(&data[0]))) + } +} + +var types = []struct { + typ reflect.Type + dataType C.PD_DataType +}{ + {reflect.TypeOf(float32(0)), C.PD_DATA_FLOAT32}, + {reflect.TypeOf(int32(0)), C.PD_DATA_INT32}, + {reflect.TypeOf(int64(0)), C.PD_DATA_INT64}, + {reflect.TypeOf(uint8(0)), C.PD_DATA_UINT8}, + {reflect.TypeOf(int8(0)), C.PD_DATA_INT8}, +} + +func dataTypeOf(val reflect.Value) (dt DataType, err error) { + typ := val.Type() + for typ.Kind() == reflect.Array || typ.Kind() == reflect.Slice { + if val.Len() > 0 { + val = val.Index(0) + } + typ = typ.Elem() + } + for _, t := range types { + if typ.Kind() == t.typ.Kind() { + return DataType(t.dataType), nil + } + } + return dt, fmt.Errorf("unsupported type %v", typ) +} diff --git a/paddle/fluid/inference/goapi/test.sh b/paddle/fluid/inference/goapi/test.sh new file mode 100644 index 0000000000000000000000000000000000000000..b764e2ac72c70e7689af6828c69d0a7bcb716d5b --- /dev/null +++ b/paddle/fluid/inference/goapi/test.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# 1. download the mobilenetv1 model to test config and predictor +if [ ! -d mobilenetv1 ]; then + wget https://paddle-inference-dist.bj.bcebos.com/Paddle-Inference-Demo/mobilenetv1.tgz + tar xzf mobilenetv1.tgz +fi + +# 2. set LD_LIBRARY_PATH +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:$PWD/paddle_inference_c/paddle/lib + +# 3. go test +go test -v ./... diff --git a/paddle/fluid/inference/goapi/utils.go b/paddle/fluid/inference/goapi/utils.go new file mode 100644 index 0000000000000000000000000000000000000000..fca5298baf9e29637b99b66f5fd1fedd9d55cb16 --- /dev/null +++ b/paddle/fluid/inference/goapi/utils.go @@ -0,0 +1,61 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package paddle + +// #include +// #include +import "C" +import ( + "unsafe" +) + +func cvtPDBoolToGo(b C.int8_t) bool { + var cFalse C.int8_t + if b != cFalse { + return true + } + return false +} + +func cvtGoBoolToPD(b bool) C.int8_t { + if b == false { + return 0 + } + return 1 +} + +func cvtToGoSliceString(length int, str **C.char) []string { + if str == nil { + return nil + } + tmpSlice := (*[1 << 27]*C.char)(unsafe.Pointer(str))[:length:length] + goStrings := make([]string, length) + for i, s := range tmpSlice { + goStrings[i] = C.GoString(s) + } + return goStrings +} + +func cvtToGoSliceInt32(length int, data *C.int32_t) []int32 { + if data == nil { + return nil + } + tmpSlice := (*[1 << 27]C.int32_t)(unsafe.Pointer(data))[:length:length] + res := make([]int32, length) + for i, s := range tmpSlice { + res[i] = int32(s) + } + return res +} diff --git a/go/paddle/common.go b/paddle/fluid/inference/goapi/version.go similarity index 50% rename from go/paddle/common.go rename to paddle/fluid/inference/goapi/version.go index cbbde6a45f59b80931a3a2c501581819085e8ea7..74b74dd501a00c106c6cc510c09475b9cb31e2c0 100644 --- a/go/paddle/common.go +++ b/paddle/fluid/inference/goapi/version.go @@ -1,4 +1,4 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,29 +14,13 @@ package paddle -// #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c -// #include -// #include +// #include "pd_common.h" +// #include "pd_predictor.h" +// #include "pd_types.h" +// #include "pd_utils.h" import "C" -import "fmt" -func ConvertCBooleanToGo(b C.bool) bool { - var c_false C.bool - if b != c_false { - return true - } - return false -} - -func numel(shape []int32) int32 { - n := int32(1) - for _, d := range shape { - n *= d - } - return n -} - -func bug(format string, args ...interface{}) error { - return fmt.Errorf("Bug %v", fmt.Sprintf(format, args...)) +func Version() string { + cVersion := C.PD_GetVersion() + return C.GoString(cVersion) } diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 3820ac5d7cc24693c388554acea0aad6ab49b83a..2e4a175566a7a100749d14c712e8ef9a89eb6019 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -12,6 +12,9 @@ nv_library(tensorrt_converter affine_channel_op.cc multiclass_nms_op.cc nearest_interp_op.cc + reshape_op.cc + reduce_op.cc + gather_nd_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 9244b9af0bbd6cfc392b1b940d81c04b0dd0cde9..e6a0ecf4aececcba012923f631b2dcfd8f69743d 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -52,11 +52,6 @@ class ActivationOpConverter : public OpConverter { engine_->GetITensor(op_desc.Input("X")[0]); auto op_pair = ops.find(op_type_); - if (op_pair == ops.end()) { - 
PADDLE_THROW(platform::errors::Fatal( - "Wrong activation op type, the trt do not support the %s act type.", - op_type_)); - } nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, Activation, *const_cast(input_tensor), diff --git a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc index 813342c08483b7e9124929d3f00d8155d337e67e..eba67c3c098ca60b7608ecf6db50b46e233955a5 100644 --- a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc @@ -55,16 +55,6 @@ class AffineChannelOpConverter : public OpConverter { auto* bias_t = bias_v->GetMutable(); float* bias_ptr = engine_->GetWeightCPUData(bias_name, bias_t, false); - auto data_layout = framework::StringToDataLayout( - BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); - - PADDLE_ENFORCE_EQ( - data_layout, framework::DataLayout::kNCHW, - platform::errors::InvalidArgument( - "TensorRT affine channel converter can only convert NCHW format. " - "Other format should be run in fluid mode. Report a bug on github " - "issue if you see this line.")); - // tensorrt scalend layer only support spatial dims >= 2, // so nhwc is not availabe (spatial dims == 0) const int channel_axis = engine_->with_dynamic_shape(); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index ba47358b147db234b4ad77ba88dec3f55d75c1e5..6bbda6bb29aadbfcf4974e2db5eac65a027a19a5 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -103,11 +103,18 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, static_cast(bias_data), bias_size}; - auto* layer = fadd_layer(const_cast(X), n_output, n_input, - nv_ksize, weight, bias); - PADDLE_ENFORCE_NOT_NULL(layer, - platform::errors::Fatal("TensorRT create conv2d" - " layer error.")); + // In conv2d_transpose and depthwise_conv2d_transpose, + // output channels = filter_dims[1] * groups + auto* layer = (op_desc.Type() == "conv2d_transpose" || + op_desc.Type() == "depthwise_conv2d_transpose") + ? 
fadd_layer(const_cast(X), + n_input * groups, nv_ksize, weight, bias) + : fadd_layer(const_cast(X), n_output, + nv_ksize, weight, bias); + + PADDLE_ENFORCE_NOT_NULL( + layer, platform::errors::Fatal("TensorRT create conv2d/conv2d_transpose" + " layer failed.")); layer->setStride(nv_strides); layer->setPadding(nv_paddings); layer->setNbGroups(groups); @@ -134,7 +141,6 @@ class Conv2dOpConverter : public OpConverter { ConvertConv2d( engine_, op, scope, test_mode, [&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */ - int n_input, /* Conv input maps */ nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* { auto* layer = @@ -156,11 +162,10 @@ class Deconv2dOpConverter : public OpConverter { ConvertConv2d( engine_, op, scope, test_mode, [&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */ - int n_input, /* Deconv output maps */ nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* { auto* layer = - TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_input, + TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_output, ksize, weight.get(), bias.get()); return layer; }, diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 5419933e4073673f56c72d06c49f488167421dbe..2f802ea8d181ea26e257fcba53f584a0df2f55f0 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -62,6 +62,25 @@ class ElementwiseWeightOpConverter : public OpConverter { 0}; TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; + + nvinfer1::IShuffleLayer* expand_layer = nullptr; + nvinfer1::IShuffleLayer* squeeze_layer = nullptr; + int dynamic_shape_offset = engine_->with_dynamic_shape() ? 1 : 0; + auto input_dim = X->getDimensions(); + if (input_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims expand_shape; + expand_shape.nbDims = 3 + dynamic_shape_offset; + for (int i = 0; i < expand_shape.nbDims; i++) { + if (i < input_dim.nbDims) { + expand_shape.d[i] = input_dim.d[i] < 0 ? 0 : input_dim.d[i]; + } else { + expand_shape.d[i] = 1; + } + } + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + expand_layer->setReshapeDimensions(expand_shape); + X = expand_layer->getOutput(0); + } if (op_type_ == "add") { nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( engine_, Scale, *X, scale_mode, shift_weights.get(), @@ -73,7 +92,17 @@ class ElementwiseWeightOpConverter : public OpConverter { shift_weights.get(), power_weights.get()); layer = scale_layer; } - + if (input_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims squeeze_shape; + squeeze_shape.nbDims = input_dim.nbDims; + for (int i = 0; i < squeeze_shape.nbDims; i++) { + squeeze_shape.d[i] = input_dim.d[i] < 0 ? 
0 : input_dim.d[i]; + } + squeeze_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); + squeeze_layer->setReshapeDimensions(squeeze_shape); + layer = static_cast(squeeze_layer); + } auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name}, test_mode); @@ -222,10 +251,10 @@ class ElementwiseTensorOpConverter : public OpConverter { } else { plugin::ElementWisePlugin* plugin = new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis); - plugin->AddInput(X); - plugin->AddInput(Y); - nvinfer1::IPluginLayer* plugin_layer = engine_->AddPlugin( - plugin->GetInputs().data(), 2, + + std::vector inputs{X, Y}; + auto* plugin_layer = engine_->AddPlugin( + inputs.data(), inputs.size(), reinterpret_cast(plugin)); layer = plugin_layer; diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 66a682db07b91195046d3d11031b8739b72b81c4..18bbd1d2b770348ef2d051ab0a7c3602bd02dd09 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -36,14 +36,25 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); auto word_id_name = op_desc.Input("WordId").front(); auto pos_id_name = op_desc.Input("PosId").front(); + engine_->Set("ernie_pos_name", new std::string(pos_id_name)); + auto sent_id_name = op_desc.Input("SentId").front(); auto word_emb_name = op_desc.Input("WordEmbedding").front(); auto pos_emb_name = op_desc.Input("PosEmbedding").front(); auto sent_emb_name = op_desc.Input("SentEmbedding").front(); - std::vector id_names = {word_id_name, pos_id_name, - sent_id_name}; - std::vector emb_names = {word_emb_name, pos_emb_name, - sent_emb_name}; + + std::vector id_names; + std::vector emb_names; + + if (engine_->use_oss()) { + id_names = + std::vector{word_id_name, pos_id_name, sent_id_name}; + emb_names = + std::vector{word_emb_name, pos_emb_name, sent_emb_name}; + } else { + id_names = op_desc.Input("Ids"); + emb_names = op_desc.Input("Embs"); + } int input_num = id_names.size(); diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 6167e68df2b6731eddbfae03aca3c30f2575ae40..74bb854e55f8231042fb014817a81dfa647c7e7b 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -37,7 +37,7 @@ class FcOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias"; framework::OpDesc op_desc(op, nullptr); - + auto output_name = op_desc.Output("Out").front(); auto input_names = op_desc.InputNames(); bool with_bias = input_names.size() >= 3; std::string w_name = "Y"; @@ -48,13 +48,14 @@ class FcOpConverter : public OpConverter { } // Declare inputs auto* X = engine_->GetITensor(op_desc.Input(i_name).front()); + auto x_dim = X->getDimensions(); // Declare weights auto* Y_v = scope.FindVar(op_desc.Input(w_name).front()); PADDLE_ENFORCE_NOT_NULL( Y_v, platform::errors::NotFound( "Can not find %s presistale var of fc in scope.", w_name)); auto* Y_t = Y_v->GetMutable(); - const int x_num_col_dims = + int x_num_col_dims = op_desc.HasAttr("x_num_col_dims") ? 
BOOST_GET_CONST(int, op_desc.GetAttr("x_num_col_dims")) : (op_desc.HasAttr("in_num_col_dims") @@ -106,8 +107,8 @@ class FcOpConverter : public OpConverter { auto regist_fc = [&](nvinfer1::ITensor* inputs, int n_output, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) { - nvinfer1::ILayer* fc_layer = nullptr; if (enable_int8) { + // add conv layer PADDLE_ENFORCE_EQ( op_desc.HasAttr("out_threshold"), true, platform::errors::InvalidArgument( @@ -115,22 +116,52 @@ class FcOpConverter : public OpConverter { float out_scale = BOOST_GET_CONST(float, op_desc.GetAttr("out_threshold")); nvinfer1::DimsHW nv_ksize(1, 1); - fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output, - nv_ksize, weight.get(), bias.get()); - engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale); - } else { - fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, - n_output, weight.get(), bias.get()); - } - - auto output_name = op_desc.Output("Out").front(); - if (activation_type == "relu") { - nvinfer1::IActivationLayer* relu_layer = - TRT_ENGINE_ADD_LAYER(engine_, Activation, *(fc_layer->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer, "fc", {output_name}, test_mode); + auto* fc_layer_int8 = + TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output, + nv_ksize, weight.get(), bias.get()); + engine_->SetTensorDynamicRange(fc_layer_int8->getOutput(0), out_scale); + if (activation_type == "relu") { + nvinfer1::IActivationLayer* relu_layer_int8 = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(fc_layer_int8->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer_int8, "relu_after_fc_shuffle", + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(fc_layer_int8, "shuffle_after_fc", + {output_name}, test_mode); + } } else { - RreplenishLayerAndOutput(fc_layer, "fc", {output_name}, test_mode); + // add fc layer + auto* fc_layer_before = + TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *inputs, n_output, + weight.get(), bias.get()); + fc_layer_before->setName( + ("fc_layer_before(Output: " + output_name + ")").c_str()); + // add shuffle after fc + nvinfer1::Dims reshape_after_fc_dim; + if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 && + x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 1) { + // If use tensorrt'oss, the x_dim and x_num_col_dims need change + reshape_after_fc_dim.nbDims = 4; + } else { + reshape_after_fc_dim.nbDims = x_num_col_dims + 1; + } + for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { + reshape_after_fc_dim.d[i] = 0; + } + auto* fc_layer_float = TRT_ENGINE_ADD_LAYER( + engine_, Shuffle, *fc_layer_before->getOutput(0)); + fc_layer_float->setReshapeDimensions(reshape_after_fc_dim); + if (activation_type == "relu") { + nvinfer1::IActivationLayer* relu_layer_float = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(fc_layer_float->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer_float, "relu_after_fc_shuffle", + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(fc_layer_float, "shuffle_after_fc", + {output_name}, test_mode); + } } }; @@ -157,153 +188,47 @@ class FcOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_num)}; - if (engine_->with_dynamic_shape()) { - // not NCHW layout, but NLP layout with added 'x 1 x 1' - auto x_dim = X->getDimensions(); - if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 && - x_dim.d[2] == 1 && x_dim.d[3] == 1 && 
x_num_col_dims == 2) { - // fc which is just after self attention - regist_fc(X, n_output, weight, bias); - return; - } - PADDLE_ENFORCE_LE( - x_dim.nbDims - x_num_col_dims, 3, - platform::errors::InvalidArgument( - "Params and input dims mismatch. Paddle-TRT FC " - "converter expects x_dim.nbDims - x_num_col_dims <= 3, but " - "x_dim.nbDims = %d, x_num_col_dims = %d.", - x_dim.nbDims, x_num_col_dims)); - auto output_name = op_desc.Output("Out").front(); - // add shuffle before fc - nvinfer1::Dims reshape_before_fc_dim; - // padding shape "x 1 x 1" - int padding_length = 3 - (x_dim.nbDims - x_num_col_dims); - reshape_before_fc_dim.nbDims = x_dim.nbDims + padding_length; - int cur_dim_index = reshape_before_fc_dim.nbDims - 1; - while (padding_length-- > 0) { - reshape_before_fc_dim.d[cur_dim_index--] = 1; - } - while (cur_dim_index >= 0) { - reshape_before_fc_dim.d[cur_dim_index--] = 0; - } - - auto* reshape_before_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); - reshape_before_fc_layer->setName( - ("shuffle_before_fc(Output: " + output_name + ")").c_str()); - - // add fc layer - auto* fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), - n_output, weight.get(), bias.get()); - fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str()); - - // add shuffle after fc - nvinfer1::Dims reshape_after_fc_dim; - reshape_after_fc_dim.nbDims = x_num_col_dims + 1; - for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { - reshape_after_fc_dim.d[i] = 0; - } - - auto* reshape_after_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); - reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); - - if (activation_type == "relu") { - reshape_after_fc_layer->setName( - ("shuffle_after_fc(Output: " + output_name + ")").c_str()); - nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER( - engine_, Activation, *(reshape_after_fc_layer->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle", - {output_name}, test_mode); - } else { - RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc", - {output_name}, test_mode); - } - return; + // Running the TRT Static Shape mode: x_num_col_dims-1 + if (!engine_->with_dynamic_shape()) { + x_num_col_dims--; } - // in order to handle situations in NLP models(input dims < 3, - // x_num_col_dims != 1, etc.), reshape input to perform FC correctly. - auto* reshape_itensor = X; - int input_dims = X->getDimensions().nbDims; - auto input_d = X->getDimensions().d; - int reshape_dim3[3] = {0}; - int reshape_dim4[4] = {0}; - PADDLE_ENFORCE_LE(x_num_col_dims, input_dims, - platform::errors::InvalidArgument( - "Params and input dims mismatch. Paddle-TRT FC " - "converter expects x_num_col_dims <= input dims")); - if (x_num_col_dims == 1) { - if (input_dims == 4) { - PADDLE_ENFORCE_EQ( - input_d[3], 1, - platform::errors::InvalidArgument( - "Invalid dimensions. 
When x_num_col_dims equals to 1 and input " - "dims equals to 4, the last dim of input must be 1, but got %d", - input_d[3])); - } - if (enable_int8) { - reshape_dim3[0] = 1; - for (int i = 0; i < 3; i++) { - reshape_dim3[0] *= input_d[i]; - if (i > 0) { - reshape_dim3[i] = 1; - } - } - } else { - for (int i = 0; i < 3; i++) { - if (i < input_dims) { - reshape_dim3[i] = input_d[i]; - } else { - reshape_dim3[i] = 1; - } - } - } - - nvinfer1::Dims3 reshape_dim(reshape_dim3[0], reshape_dim3[1], - reshape_dim3[2]); - auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_layer->setReshapeDimensions(reshape_dim); - reshape_itensor = reshape_layer->getOutput(0); - if (enable_int8) { - engine_->SetTensorDynamicRange(reshape_itensor, in_scale); - } - } else { - PADDLE_ENFORCE_NE(input_dims, 1, - platform::errors::InvalidArgument( - "Invalid dimensions. When x_num_col_dims equals to " - "2, input_dims should not be 1")); - - if (enable_int8) { - for (int i = 0; i < 4; i++) { - if (i == 0) { - reshape_dim4[i] = input_d[i]; - } else { - reshape_dim4[i] = 1; - if (i < input_dims) { - reshape_dim4[1] *= input_d[i]; - } - } - } + // If use tensorrt'oss, the x_dim and x_num_col_dims need change + if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 && + x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 2) { + x_num_col_dims = 1; + } + PADDLE_ENFORCE_GT( + x_dim.nbDims, x_num_col_dims, + platform::errors::InvalidArgument( + "Params and input dims mismatch. Paddle-TRT FC " + "converter expects x_dim.nbDims > x_num_col_dims, but " + "x_dim.nbDims : %d, x_num_col_dims : %d.", + x_dim.nbDims, x_num_col_dims)); + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + reshape_before_fc_dim.nbDims = x_num_col_dims + 3; + // padding shape "* x q x 1 x 1" + for (int i = 0; i < reshape_before_fc_dim.nbDims; i++) { + reshape_before_fc_dim.d[i] = 1; + } + for (int i = 0; i < x_dim.nbDims; i++) { + if (i < x_num_col_dims) { + reshape_before_fc_dim.d[i] = 0; } else { - for (int i = 0; i < 4; i++) { - if (i < input_dims) { - reshape_dim4[i] = input_d[i]; - } else { - reshape_dim4[i] = 1; - } + if (x_dim.d[i] < 0) { + reshape_before_fc_dim.d[x_num_col_dims] = -1; + break; } + reshape_before_fc_dim.d[x_num_col_dims] *= x_dim.d[i]; } - nvinfer1::Dims4 reshape_dim(reshape_dim4[0], reshape_dim4[1], - reshape_dim4[2], reshape_dim4[3]); - auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_layer->setReshapeDimensions(reshape_dim); - reshape_itensor = reshape_layer->getOutput(0); - if (enable_int8) { - engine_->SetTensorDynamicRange(reshape_itensor, in_scale); - } + } + auto* reshape_before_fc_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("shuffle_before_fc(Output: " + output_name + ")").c_str()); + auto* reshape_itensor = reshape_before_fc_layer->getOutput(0); + if (enable_int8) { + engine_->SetTensorDynamicRange(reshape_itensor, in_scale); } regist_fc(reshape_itensor, n_output, weight, bias); } diff --git a/paddle/fluid/inference/tensorrt/convert/flatten_op.cc b/paddle/fluid/inference/tensorrt/convert/flatten_op.cc index 03a1c1672469eca959dc08800b248f96ef165b13..25351cc10ec11b733c745522499a637129d399a2 100644 --- a/paddle/fluid/inference/tensorrt/convert/flatten_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/flatten_op.cc @@ -25,7 +25,7 @@ namespace inference { namespace tensorrt { /* - * FlattenOp, only support static shape 
mode currently. + * FlattenOp trt converter */ class FlattenOpConverter : public OpConverter { public: @@ -35,21 +35,57 @@ class FlattenOpConverter : public OpConverter { // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("X")[0]); int dims = input->getDimensions().nbDims; - - int dim_prod = 1; - for (int i = 0; i < dims; i++) { - int dim_i = input->getDimensions().d[i]; - PADDLE_ENFORCE_GT( - dim_i, 0, platform::errors::InvalidArgument( - "flatten input dim should be > 0, but got %d.", dim_i)); - dim_prod *= dim_i; + nvinfer1::IShuffleLayer* layer = nullptr; + if (!engine_->with_dynamic_shape()) { + int dim_prod = 1; + for (int i = 0; i < dims; i++) { + int dim_i = input->getDimensions().d[i]; + PADDLE_ENFORCE_GT( + dim_i, 0, + platform::errors::InvalidArgument( + "flatten input dim should be > 0, but got %d.", dim_i)); + dim_prod *= dim_i; + } + nvinfer1::Dims flatten_dim; + flatten_dim.nbDims = 1; + flatten_dim.d[0] = dim_prod; + layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + layer->setReshapeDimensions(flatten_dim); + } else { + auto* shape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shape, *input); + nvinfer1::Dims start_dim, size_dim, stride_dim; + start_dim.nbDims = 1; + size_dim.nbDims = 1; + stride_dim.nbDims = 1; + start_dim.d[0] = 1; + size_dim.d[0] = dims - 1; + stride_dim.d[0] = 1; + auto* slice_layer = + TRT_ENGINE_ADD_LAYER(engine_, Slice, *(shape_layer->getOutput(0)), + start_dim, size_dim, stride_dim); + uint32_t reduce_dim = 1; + auto* reduce_prod_layer = TRT_ENGINE_ADD_LAYER( + engine_, Reduce, *(slice_layer->getOutput(0)), + nvinfer1::ReduceOperation::kPROD, reduce_dim, true); + int32_t* constant_weight_data = new int32_t[1]; + constant_weight_data[0] = -1; + TensorRTEngine::Weight constant_weight{ + nvinfer1::DataType::kINT32, static_cast(constant_weight_data), + 1}; + nvinfer1::Dims constant_dims; + constant_dims.nbDims = 1; + constant_dims.d[0] = 1; + auto* constant_layer = TRT_ENGINE_ADD_LAYER( + engine_, Constant, constant_dims, constant_weight.get()); + std::vector itensors; + itensors.push_back(constant_layer->getOutput(0)); + itensors.push_back(reduce_prod_layer->getOutput(0)); + auto* concat_layer = + TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(), 2); + concat_layer->setAxis(0); + layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + layer->setInput(1, *(concat_layer->getOutput(0))); } - nvinfer1::Dims flatten_dim; - flatten_dim.nbDims = 1; - flatten_dim.d[0] = dim_prod; - auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); - layer->setReshapeDimensions(flatten_dim); - auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "flatten", {output_name}, test_mode); } diff --git a/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc b/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..489fc987dfec2a13b4baccb06911c940b627d908 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/gather_nd_op.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class GatherNdOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert a paddle gather_nd op to tensorrt gather_nd plugin"; + framework::OpDesc op_desc(op, nullptr); + + // Declare inputs + std::vector inputs; + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + auto* index = engine_->GetITensor(op_desc.Input("Index")[0]); + inputs.emplace_back(input); + inputs.emplace_back(index); + + nvinfer1::ILayer* layer = nullptr; + bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::GatherNdPluginDynamic* plugin = + new plugin::GatherNdPluginDynamic(with_fp16); + layer = engine_->AddDynamicPlugin(inputs.data(), inputs.size(), plugin); + + std::string layer_name = "gather_nd (Output: "; + auto output_name = op_desc.Output("Out")[0]; + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + layer_name += output_name; + if (test_mode) { + engine_->DeclareOutput(output_name); + } + layer->setName((layer_name + ")").c_str()); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(gather_nd, GatherNdOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc index 2fd0d82bb1ea34af4e3d6dc9efb581ff9bd49916..b7097fc05680d4b161798f31c25386b3183b5329 100644 --- a/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/instance_norm_op.cc @@ -74,7 +74,7 @@ class InstanceNormOpConverter : public OpConverter { plugin::InstanceNormPlugin* plugin = new plugin::InstanceNormPlugin(eps, scale_v, bias_v); plugin->getPluginType(); - nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, 1, plugin); + auto* layer = engine_->AddPlugin(&input, 1, plugin); auto output_name = op_desc.Output("Y")[0]; RreplenishLayerAndOutput(layer, "instance_norm", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index 0b97b5d87a3d506e9e14ea5780a9e7b4ac471c83..de5d3110e189030568b3dfeb5a04e5dbe249ae58 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -46,13 +46,6 @@ class LayerNormOpConverter : public OpConverter { auto* Bias_t = Bias_v->GetMutable(); auto* Scale_t = Scale_v->GetMutable(); - int input_num = 1; - for (int i = 0; i < X->getDimensions().nbDims; i++) { - input_num *= X->getDimensions().d[i]; - } - std::vector mean_shape{input_num}; - std::vector variance_shape{input_num}; - std::unique_ptr bias_tensor( new framework::LoDTensor()); std::unique_ptr scale_tensor( @@ -68,10 +61,33 @@ class LayerNormOpConverter : public OpConverter { auto* bias_data = bias_tensor->mutable_data(platform::CPUPlace()); auto* scale_data = scale_tensor->mutable_data(platform::CPUPlace()); - plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin( - bias_data, bias_tensor->numel(), scale_data, scale_tensor->numel(), - begin_norm_axis, 
eps, mean_shape, variance_shape); - nvinfer1::IPluginLayer* layernorm_layer = engine_->AddPlugin(&X, 1, plugin); + nvinfer1::ILayer* layernorm_layer = nullptr; + if (engine_->with_dynamic_shape()) { + int input_num = 1; + for (int i = begin_norm_axis; i < X->getDimensions().nbDims; i++) { + input_num *= X->getDimensions().d[i]; + } + std::vector mean_shape{input_num}; + std::vector variance_shape{input_num}; + plugin::LayerNormPluginDynamic* plugin = + new plugin::LayerNormPluginDynamic(bias_data, bias_tensor->numel(), + scale_data, scale_tensor->numel(), + begin_norm_axis, eps, mean_shape, + variance_shape); + layernorm_layer = engine_->AddDynamicPlugin(&X, 1, plugin); + } else { + int input_num = 1; + for (int i = begin_norm_axis - 1; i < X->getDimensions().nbDims; i++) { + input_num *= X->getDimensions().d[i]; + } + std::vector mean_shape{input_num}; + std::vector variance_shape{input_num}; + plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin( + bias_data, bias_tensor->numel(), scale_data, scale_tensor->numel(), + begin_norm_axis, eps, mean_shape, variance_shape); + layernorm_layer = engine_->AddPlugin( + &X, 1, reinterpret_cast(plugin)); + } auto output_name = op_desc.Output("Y").front(); engine_->SetWeights(op_desc.Input("Bias").front(), std::move(bias_tensor)); diff --git a/paddle/fluid/inference/tensorrt/convert/matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/matmul_op.cc index a182119776edd9ba901f0469597341578ee687b1..0358c86926bec2244108bb398d2df7b1816e8064 100644 --- a/paddle/fluid/inference/tensorrt/convert/matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/matmul_op.cc @@ -45,9 +45,16 @@ class MatMulOpConverter : public OpConverter { bool transpose_X = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_X")); bool transpose_Y = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_Y")); - auto* layer = TRT_ENGINE_ADD_LAYER( - engine_, MatrixMultiply, *const_cast(input1), - transpose_X, *const_cast(input2), transpose_Y); + nvinfer1::MatrixOperation matrix_operation_X = + transpose_X ? nvinfer1::MatrixOperation::kTRANSPOSE + : nvinfer1::MatrixOperation::kNONE; + nvinfer1::MatrixOperation matrix_operation_Y = + transpose_Y ? 
nvinfer1::MatrixOperation::kTRANSPOSE + : nvinfer1::MatrixOperation::kNONE; + + auto* layer = + TRT_ENGINE_ADD_LAYER(engine_, MatrixMultiply, *input1, + matrix_operation_X, *input2, matrix_operation_Y); float alpha = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index f2f45c694ab44fb03cfd6b018ef0a0a1ae6f0a31..d05c9019a29d3980c701a55629b1deb04a1ddb0b 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -191,9 +191,15 @@ class MultiheadMatMulOpConverter : public OpConverter { std::vector plugin_inputs; plugin_inputs.emplace_back(fc_layer->getOutput(0)); plugin_inputs.emplace_back(mask_tensor); - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(2)->getName())); // cu_seqlens, - // eval_placeholder_2 + if (engine_->Has("ernie_pos_name")) { + plugin_inputs.emplace_back( + engine_->GetITensor(engine_->Get("ernie_pos_name"))); + } else { + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network() + ->getInput(2) + ->getName())); // cu_seqlens, eval_placeholder_2 + } auto max_seqlen_tensor = engine_->GetITensor(engine_->network()->getInput(3)->getName()); auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 8de16df0a2f610b30da389bc73e122074d66471e..57a26aec6ebcb3d1350ec560927b76bf1988d64b 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -109,6 +109,12 @@ class OpConverter { it, platform::errors::Unimplemented("no OpConverter for optype [%s]", op_desc.Type())); } + if (op_desc.Type() == "depthwise_conv2d_transpose") { + it = Registry::Global().Lookup("conv2d_transpose"); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } if (op_desc.Type() == "transpose2") { it = Registry::Global().Lookup("transpose"); PADDLE_ENFORCE_NOT_NULL( @@ -121,6 +127,13 @@ class OpConverter { it, platform::errors::Unimplemented("no OpConverter for optype [%s]", op_desc.Type())); } + // reshape2 == reshape + if (op_desc.Type() == "reshape2") { + it = Registry::Global().Lookup("reshape"); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } if (!it) { it = Registry::Global().Lookup(op_desc.Type()); } diff --git a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..66d2680fe9969cf7857130f1aa6e6aef742ca805 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
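// Illustrative sketch, not part of this patch: the matmul converter above replaces the
// deprecated bool-transpose overload of addMatrixMultiply with the
// nvinfer1::MatrixOperation form. A minimal stand-alone equivalent; every name below is
// illustrative only.
#include <NvInfer.h>
static nvinfer1::ITensor* MatMul(nvinfer1::INetworkDefinition* net,
                                 nvinfer1::ITensor* x, bool trans_x,
                                 nvinfer1::ITensor* y, bool trans_y) {
  auto to_op = [](bool t) {
    return t ? nvinfer1::MatrixOperation::kTRANSPOSE
             : nvinfer1::MatrixOperation::kNONE;
  };
  nvinfer1::IMatrixMultiplyLayer* mm =
      net->addMatrixMultiply(*x, to_op(trans_x), *y, to_op(trans_y));
  return mm->getOutput(0);
}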
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class ReduceSumOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert a paddle reduce_sum op to tensorrt reduce layer"; + framework::OpDesc op_desc(op, nullptr); + + auto* x = engine_->GetITensor(op_desc.Input("X").front()); + nvinfer1::Dims input_shape = x->getDimensions(); + int input_dims = input_shape.nbDims; + + bool keep_dim = BOOST_GET_CONST(bool, op_desc.GetAttr("keep_dim")); + std::vector dim = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("dim")); + bool reduce_all = BOOST_GET_CONST(bool, op_desc.GetAttr("reduce_all")); + + // Now we only support dynamic_shape mode. + nvinfer1::IReduceLayer* layer = nullptr; + if (reduce_all) { + uint32_t reduce_dim = 0; + for (int i = 0; i < input_dims; ++i) { + reduce_dim |= 1 << i; + } + layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x, + nvinfer1::ReduceOperation::kSUM, reduce_dim, + keep_dim); + } else { + auto CvtToBitMask = [&](const std::vector& dims) -> uint32_t { + uint32_t res = 0; + for (auto x : dims) { + if (x < 0) { + res |= 1 << (x + input_dims); + } else { + res |= 1 << x; + } + } + return res; + }; + layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x, + nvinfer1::ReduceOperation::kSUM, + CvtToBitMask(dim), keep_dim); + } + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "reduce_sum", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(reduce_sum, ReduceSumOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/reshape_op.cc b/paddle/fluid/inference/tensorrt/convert/reshape_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..489603e20cda2f1143fd4791c8cbe5e8e58e4148 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/reshape_op.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
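// Illustrative sketch, not part of this patch: addReduce takes the reduction axes as a
// uint32_t bitmask, so the reduce_sum converter above maps Paddle's "dim" attribute to
// bits, wrapping negative axes by the input rank first. A stand-alone version of that
// conversion, with an illustrative name.
#include <cstdint>
#include <vector>
static uint32_t AxesToBitMask(const std::vector<int32_t>& axes, int32_t rank) {
  uint32_t mask = 0;
  for (int32_t axis : axes) {
    mask |= 1u << (axis < 0 ? axis + rank : axis);  // e.g. axis -1 with rank 4 -> bit 3
  }
  return mask;
}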
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * ReshapeOp + */ +class ReshapeOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + std::vector shape = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("shape")); + int nbDims_num = shape.size(); + nvinfer1::Dims reshape_dim; + if (engine_->with_dynamic_shape()) { // running the TRT Dynamic Shape mode + reshape_dim.nbDims = nbDims_num; + for (int i = 0; i < nbDims_num; ++i) { + reshape_dim.d[i] = shape[i]; + } + } else { // running the TRT Static Shape mode + reshape_dim.nbDims = nbDims_num - 1; + for (int i = 0; i < nbDims_num - 1; ++i) { + reshape_dim.d[i] = shape[i + 1]; + } + } + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + layer->setReshapeDimensions(reshape_dim); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "reshape", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(reshape, ReshapeOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc index 0fdc262f7e740bc577bdb21a457d4288fcf7bf94..976fe9502acd6611d933b3af29187c7320a1f7e4 100644 --- a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc @@ -57,11 +57,12 @@ class ShuffleChannelOpConverter : public OpConverter { auto* output = layer->getOutput(0); auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *output); - nvinfer1::DimsCHW reshape_dim2(c, h, w); + nvinfer1::Dims3 reshape_dim2(c, h, w); reshape_layer->setReshapeDimensions(reshape_dim2); auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(reshape_layer, "concat", {output_name}, test_mode); + RreplenishLayerAndOutput(reshape_layer, "shuffle_channel", {output_name}, + test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 2ab024dff327fda45faab01afbfbe38bb7244f93..7f270b1f390b7428aa40425ebfb2adb4d02620a8 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -76,9 +76,16 @@ class SliceOpConverter : public OpConverter { std::vector plugin_inputs; // plugin_inputs.emplace_back(trans_layer->getOutput(0)); plugin_inputs.emplace_back(input); - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(2)->getName())); // cu_seqlens, - // eval_placeholder_2 + + std::string pos_name; + if (engine_->Has("ernie_pos_name")) { + pos_name = engine_->Get("ernie_pos_name"); + } else { + // hard code for compatibility + pos_name = engine_->network()->getInput(2)->getName(); + } + plugin_inputs.emplace_back( + engine_->GetITensor(pos_name)); // cu_seqlens, eval_placeholder_2 // bool ban_fp16 = engine_->disable_trt_plugin_fp16(); plugin::SpecialSlicePluginDynamic* plugin = diff --git a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc 
b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc index 41412cb079540da72760558379b158b6538aa6a8..92e34e48bdb295eca9e8ce7a86a7d7435a37bab7 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc @@ -28,12 +28,12 @@ TEST(batch_norm_op, test) { TRTConvertValidation validator(5, parameters, scope, 1 << 15); std::vector param_shape{2}; - validator.DeclInputVar("batch_norm_X", nvinfer1::DimsCHW(2, 5, 5)); + validator.DeclInputVar("batch_norm_X", nvinfer1::Dims3(2, 5, 5)); validator.DeclParamVar("batch_norm_scale", param_shape); validator.DeclParamVar("batch_norm_bias", param_shape); validator.DeclParamVar("batch_norm_mean", param_shape); validator.DeclParamVar("batch_norm_variance", param_shape); - validator.DeclOutputVar("batch_norm_Y", nvinfer1::DimsCHW(2, 5, 5)); + validator.DeclOutputVar("batch_norm_Y", nvinfer1::Dims3(2, 5, 5)); validator.DeclOutputVar("batch_norm_save_mean", param_shape); validator.DeclOutputVar("batch_norm_save_variance", param_shape); diff --git a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc index 4f284a4db5758e072915d7fd0f16115b8a36ba8b..6c876964297f94db27b0d683571f99f0605a68f3 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc @@ -24,10 +24,10 @@ TEST(concat_op, test) { std::unordered_set parameters({""}); framework::Scope scope; TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("concat_x1", nvinfer1::DimsCHW(10, 3, 1)); - validator.DeclInputVar("concat_x2", nvinfer1::DimsCHW(3, 3, 1)); - validator.DeclInputVar("concat_x3", nvinfer1::DimsCHW(7, 3, 1)); - validator.DeclOutputVar("concat_out", nvinfer1::DimsCHW(20, 3, 1)); + validator.DeclInputVar("concat_x1", nvinfer1::Dims3(10, 3, 1)); + validator.DeclInputVar("concat_x2", nvinfer1::Dims3(3, 3, 1)); + validator.DeclInputVar("concat_x3", nvinfer1::Dims3(7, 3, 1)); + validator.DeclOutputVar("concat_out", nvinfer1::Dims3(20, 3, 1)); // Prepare Op description framework::OpDesc desc; diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc index 81e905b975327125fddc8a33d871cc97290e4ac1..474fd92071fb0795b868f0cd86591061cf8b6581 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc @@ -25,10 +25,9 @@ TEST(DropoutOpConverter, main) { TRTConvertValidation validator(8, parameters, scope, 1000); std::vector tensor_shape{8, 10}; - validator.DeclInputVar("dropout-X", tensor_shape, - nvinfer1::DimsCHW(10, 1, 1)); - validator.DeclOutputVar("dropout-Out", nvinfer1::DimsCHW(10, 1, 1)); - validator.DeclOutputVar("mask-Out", nvinfer1::DimsCHW(10, 1, 1)); + validator.DeclInputVar("dropout-X", tensor_shape, nvinfer1::Dims3(10, 1, 1)); + validator.DeclOutputVar("dropout-Out", nvinfer1::Dims3(10, 1, 1)); + validator.DeclOutputVar("mask-Out", nvinfer1::Dims3(10, 1, 1)); // Prepare Op description framework::OpDesc desc; diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc index cc967464a5f29151a061e99cda6870f9f370ec1b..17adf957f64a76a010da6160479be2125d9deac9 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc @@ -24,9 
+24,9 @@ TEST(elementwise_op, add_weight) { std::unordered_set parameters({"elementwise_add-Y"}); framework::Scope scope; TRTConvertValidation validator(10, parameters, scope, 1 << 15); - validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclInputVar("elementwise_add-X", nvinfer1::Dims3(10, 3, 3)); validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1)); - validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclOutputVar("elementwise_add-Out", nvinfer1::Dims3(10, 3, 3)); // Prepare Op description framework::OpDesc desc; @@ -50,11 +50,11 @@ TEST(elementwise_op, native) { framework::Scope scope; TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15); validator.DeclInputVar("elementwise_" + type + "-X", - nvinfer1::DimsCHW(10, 3, 3)); + nvinfer1::Dims3(10, 3, 3)); validator.DeclInputVar("elementwise_" + type + "-Y", nvinfer1::Dims3(10, 3, 3)); validator.DeclOutputVar("elementwise_" + type + "-Out", - nvinfer1::DimsCHW(10, 3, 3)); + nvinfer1::Dims3(10, 3, 3)); // Prepare Op description framework::OpDesc desc; @@ -78,11 +78,11 @@ TEST(elementwise_op, plugin) { framework::Scope scope; TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15); validator.DeclInputVar("elementwise_" + type + "-X", - nvinfer1::DimsCHW(10, 3, 3)); + nvinfer1::Dims3(10, 3, 3)); validator.DeclInputVar("elementwise_" + type + "-Y", nvinfer1::Dims3(10, 1, 1)); validator.DeclOutputVar("elementwise_" + type + "-Out", - nvinfer1::DimsCHW(10, 3, 3)); + nvinfer1::Dims3(10, 3, 3)); // Prepare Op description framework::OpDesc desc; diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc index d00826af075159004d3727a7519e7c319dbddb02..1725888abc379bfa4ffbbc5cfc4cecd1872c7c18 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc @@ -24,8 +24,8 @@ TEST(leaky_relu_op, test_leaky_relu) { std::unordered_set parameters; framework::Scope scope; TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("leaky_relu_input", nvinfer1::DimsCHW(3, 2, 2)); - validator.DeclOutputVar("leaky_relu_out", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclInputVar("leaky_relu_input", nvinfer1::Dims3(3, 2, 2)); + validator.DeclOutputVar("leaky_relu_out", nvinfer1::Dims3(3, 2, 2)); // Prepare Op description framework::OpDesc desc; diff --git a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc index b086c910d38a243d98315f2d6eb82ecc0ec5c06d..f2541ff7c0b5e5a49b78a700f1fccfed377e4acc 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc @@ -24,9 +24,9 @@ TEST(prelu_op, test_channel_wise) { std::unordered_set parameters({"prelu_alpha"}); framework::Scope scope; TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclInputVar("prelu_input", nvinfer1::Dims3(3, 2, 2)); validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(3, 1, 1)); - validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclOutputVar("prelu_out", nvinfer1::Dims3(3, 2, 2)); // Prepare Op description framework::OpDesc desc; @@ -46,9 +46,9 @@ TEST(prelu_op, test_element_wise) { std::unordered_set parameters({"prelu_alpha"}); 
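  // Illustrative note, not part of this patch: nvinfer1::DimsCHW and nvinfer1::DimsNCHW
  // were deprecated in newer TensorRT releases (and removed in TensorRT 8), which is why
  // these tests switch to the plain Dims3/Dims4 aliases; the replacement is a drop-in:
  nvinfer1::Dims3 example_chw(3, 2, 2);  // nbDims == 3, d == {3, 2, 2}
  (void)example_chw;                     // only here to show the type; unused otherwise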
framework::Scope scope; TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclInputVar("prelu_input", nvinfer1::Dims3(3, 2, 2)); validator.DeclParamVar("prelu_alpha", nvinfer1::Dims4(10, 3, 2, 2)); - validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclOutputVar("prelu_out", nvinfer1::Dims3(3, 2, 2)); // Prepare Op description framework::OpDesc desc; @@ -68,9 +68,9 @@ TEST(prelu_op, test_scalar) { std::unordered_set parameters({"prelu_alpha"}); framework::Scope scope; TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclInputVar("prelu_input", nvinfer1::Dims3(3, 2, 2)); validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(1, 1, 1)); - validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclOutputVar("prelu_out", nvinfer1::Dims3(3, 2, 2)); // Prepare Op description framework::OpDesc desc; diff --git a/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc index e3cc5273734e02ecc4ed6453e6cd47052463c8b2..3ebb51afdf44f488d5acb7481be0ce6714324454 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc @@ -24,8 +24,8 @@ TEST(leaky_relu_op, test_leaky_relu) { std::unordered_set parameters; framework::Scope scope; TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("sc_input", nvinfer1::DimsCHW(4, 2, 2)); - validator.DeclOutputVar("sc_out", nvinfer1::DimsCHW(4, 2, 2)); + validator.DeclInputVar("sc_input", nvinfer1::Dims3(4, 2, 2)); + validator.DeclOutputVar("sc_out", nvinfer1::Dims3(4, 2, 2)); // Prepare Op description framework::OpDesc desc; diff --git a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc index 503ce71f7fb4377bb4304569b7484fb25abdb284..b6fdcddf309d85a68ea67f33c157fbcf5ce5affc 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc @@ -25,9 +25,8 @@ TEST(SoftMaxOpConverter, main) { TRTConvertValidation validator(8, parameters, scope, 1000); std::vector tensor_shape{8, 10}; - validator.DeclInputVar("softmax-X", tensor_shape, - nvinfer1::DimsCHW(10, 1, 1)); - validator.DeclOutputVar("softmax-Out", nvinfer1::DimsCHW(10, 1, 1)); + validator.DeclInputVar("softmax-X", tensor_shape, nvinfer1::Dims3(10, 1, 1)); + validator.DeclOutputVar("softmax-Out", nvinfer1::Dims3(10, 1, 1)); // Prepare Op description framework::OpDesc desc; diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc index 5aacc5c600dd1371e3865adc888bb8e24640e7d9..3b6a4a80044eb6853e3e689b9d2f71317a7d7839 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -28,7 +28,7 @@ void TensorRTSplitTest(const std::vector &in_shape, TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000); auto make_dim = [](const std::vector &shape) { - nvinfer1::DimsCHW dim; + nvinfer1::Dims3 dim; dim.c() = shape[0]; dim.h() = shape[1]; dim.w() = shape[2]; diff --git a/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc index 
c15c79bb13fad4233775482dc1b8b4841e61a23a..7a5a886affed33bdb35b741889f7a2635576543a 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc @@ -24,8 +24,8 @@ TEST(swish_op, test_swish) { std::unordered_set parameters; framework::Scope scope; TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("sw_input", nvinfer1::DimsCHW(3, 2, 2)); - validator.DeclOutputVar("sw_out", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclInputVar("sw_input", nvinfer1::Dims3(3, 2, 2)); + validator.DeclOutputVar("sw_out", nvinfer1::Dims3(3, 2, 2)); // Prepare Op description framework::OpDesc desc; diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 99549fd6b5cbf96cf803e7f44b28c948daf0763d..e77e12713ca202b0f28198fcaba3dae2bd5ad1fa 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -34,17 +34,15 @@ void TensorRTEngine::InitNetwork() { infer_builder_.reset(createInferBuilder(&logger_)); if (with_dynamic_shape_) { -#if IS_TRT_VERSION_GE(6000) - infer_networkv2_.reset(infer_builder_->createNetworkV2( + infer_network_.reset(infer_builder_->createNetworkV2( 1U << static_cast( nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH))); - infer_builder_config_.reset(infer_builder_->createBuilderConfig()); - infer_ptr infer_builder_config_; - optim_profile_ = infer_builder_->createOptimizationProfile(); -#endif } else { - infer_network_.reset(infer_builder_->createNetwork()); + infer_network_.reset(infer_builder_->createNetworkV2(0U)); } + + infer_builder_config_.reset(infer_builder_->createBuilderConfig()); + optim_profile_ = infer_builder_->createOptimizationProfile(); } void TensorRTEngine::Execute(int batch_size, std::vector *buffers, @@ -73,12 +71,12 @@ void TensorRTEngine::FreezeNetwork() { "Call InitNetwork first to initialize network.")); // build engine. infer_builder_->setMaxBatchSize(max_batch_); - infer_builder_->setMaxWorkspaceSize(max_workspace_); + infer_builder_config_->setMaxWorkspaceSize(max_workspace_); + bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf); -#if IS_TRT_VERSION_GE(5000) if (enable_fp16) { bool support_fp16 = infer_builder_->platformHasFastFp16(); - infer_builder_->setFp16Mode(support_fp16); + infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); if (!support_fp16) { LOG(INFO) << "You specify FP16 mode, but the hardware do not support " "FP16 speed up, use FP32 instead."; @@ -86,23 +84,19 @@ void TensorRTEngine::FreezeNetwork() { LOG(INFO) << "Run Paddle-TRT FP16 mode"; } } -#else - if (enable_fp16) - LOG(INFO) << "Using FP16 in Paddle-TRT must ensure that the version of TRT " - "is at least 5." 
- "So, use FP32 to run."; -#endif - bool enable_int8 = (precision_ == AnalysisConfig::Precision::kInt8); + bool enable_int8 = (precision_ == AnalysisConfig::Precision::kInt8); if (enable_int8) { - infer_builder_->setInt8Mode(true); + infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); + infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); + infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES); + if (calibrator_) { - infer_builder_->setInt8Calibrator(calibrator_); + infer_builder_config_->setInt8Calibrator(calibrator_); } else { - infer_builder_->setInt8Calibrator(nullptr); + infer_builder_config_->setInt8Calibrator(nullptr); #if IS_TRT_VERSION_GE(5000) - infer_builder_->setStrictTypeConstraints(true); for (auto &quant_range : quant_dynamic_range_) { auto tensor = quant_range.first; float range = quant_range.second; @@ -116,6 +110,7 @@ void TensorRTEngine::FreezeNetwork() { all_t.insert(layer->getOutput(j)); } } + for (int i = 0; i < network()->getNbInputs(); i++) { all_t.insert(network()->getInput(i)); } @@ -127,6 +122,7 @@ void TensorRTEngine::FreezeNetwork() { << ", this might be ok when trt does not need this range"; } } + #if IS_TRT_VERSION_GE(5122) auto is_layer_int8 = [&](nvinfer1::ILayer *layer) -> bool { for (int j = 0; j < layer->getNbInputs(); j++) { @@ -189,9 +185,9 @@ void TensorRTEngine::FreezeNetwork() { << infer_builder_->getNbDLACores() << ", but got " << dla_core_ << ", so use use 0 as default."; } - infer_builder_->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); - infer_builder_->setDLACore(dla_core_); - infer_builder_->allowGPUFallback(true); + infer_builder_config_->setDefaultDeviceType(nvinfer1::DeviceType::kDLA); + infer_builder_config_->setDLACore(dla_core_); + infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK); LOG(INFO) << "TensorRT DLA enabled in FreezeNetwork(), DLACore " << dla_core_; } @@ -212,30 +208,18 @@ void TensorRTEngine::FreezeNetwork() { Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true)); } infer_builder_config_->addOptimizationProfile(optim_profile_); - infer_builder_config_->setMaxWorkspaceSize(max_workspace_); - if (enable_int8) { - // Due to a bug of TRT, we must set precision BuilderFlag to kFP16 before - // kINT8 here to perform INT8 inference. 
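// Illustrative sketch, not part of this patch: the FreezeNetwork() changes in this hunk
// move the builder options (workspace size, precision flags, calibrator, DLA settings)
// from IBuilder onto IBuilderConfig and always finish with buildEngineWithConfig().
// Reduced to its core, and with illustrative names, the flow is roughly:
#include <NvInfer.h>
static nvinfer1::ICudaEngine* BuildEngine(nvinfer1::IBuilder* builder,
                                          nvinfer1::INetworkDefinition* network,
                                          size_t workspace_bytes, bool use_fp16) {
  nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
  config->setMaxWorkspaceSize(workspace_bytes);
  if (use_fp16 && builder->platformHasFastFp16()) {
    config->setFlag(nvinfer1::BuilderFlag::kFP16);
  }
  // INT8 would additionally set kINT8/kSTRICT_TYPES and an Int8 calibrator on `config`.
  return builder->buildEngineWithConfig(*network, *config);
}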
- infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); - infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); - infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES); - } - if (WithFp16()) { - infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); - if (disable_trt_plugin_fp16()) { - LOG(INFO) << "NOTE: In order to achieve higher accuracy, you have " - "disabled the fp16 mode of TRT Plugin,\n" - << "you can reopen it with " - "'config.SetDynamicShapeInfo(min_shape, max_shape, " - "opt_shape, false /*disable_trt_plugin_fp16*/)'"; - } + if (WithFp16() && disable_trt_plugin_fp16()) { + LOG(INFO) << "NOTE: In order to achieve higher accuracy, you have " + "disabled the fp16 mode of TRT Plugin,\n" + << "you can reopen it with " + "'config.SetDynamicShapeInfo(min_shape, max_shape, " + "opt_shape, false /*disable_trt_plugin_fp16*/)'"; } - infer_engine_.reset(infer_builder_->buildEngineWithConfig( - *network(), *infer_builder_config_)); #endif - } else { - infer_engine_.reset(infer_builder_->buildCudaEngine(*network())); } + infer_engine_.reset(infer_builder_->buildEngineWithConfig( + *network(), *infer_builder_config_)); + PADDLE_ENFORCE_NOT_NULL( infer_engine_, platform::errors::Fatal( "Build TensorRT cuda engine failed! Please recheck " @@ -346,11 +330,11 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name, int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } -nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( +nvinfer1::IPluginV2Layer *TensorRTEngine::AddPlugin( nvinfer1::ITensor *const *inputs, int num_inputs, plugin::PluginTensorRT *plugin) { owned_plugin_.emplace_back(plugin); - return network()->addPluginExt(inputs, num_inputs, *plugin); + return network()->addPluginV2(inputs, num_inputs, *plugin); } nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2Ext( diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 2358e1ef976cdbc26eb907aff21b81f7e52d64d9..38c453bde6d2db2581056e0c9019904d2411de94 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -30,7 +30,6 @@ limitations under the License. 
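// Illustrative sketch, not part of this patch: engine.h below gains a small attribute
// store (Has/Set/SetNotOwned/Get backed by boost::any), which is what lets the slice and
// multihead_matmul converters earlier in this patch look up "ernie_pos_name" instead of
// hard-coding network input #2. Hypothetical usage, assuming engine.h is included and
// with an illustrative helper name:
#include <string>
void MarkErniePosInput(paddle::inference::tensorrt::TensorRTEngine* engine,
                       const std::string& pos_input_name) {
  if (!engine->Has("ernie_pos_name")) {
    // Set() hands ownership to the engine; the attribute is freed in ~TensorRTEngine().
    engine->Set<std::string>("ernie_pos_name", new std::string(pos_input_name));
  }
  const std::string& pos_name = engine->Get<std::string>("ernie_pos_name");
  (void)pos_name;  // converters read this back instead of network()->getInput(2)->getName()
}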
*/ #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -102,7 +101,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, std::string input, "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", input, ShapeStr(shape))); } - return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); + return nvinfer1::Dims3(shape[1], shape[2], shape[3]); } else if (shape.size() == 3UL) { if (shape[1] == -1 || shape[2] == -1) { PADDLE_THROW(platform::errors::InvalidArgument( @@ -112,10 +111,10 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, std::string input, } return nvinfer1::Dims2(shape[1], shape[2]); } - return nvinfer1::DimsCHW(shape[1], 1, 1); + return nvinfer1::Dims3(shape[1], 1, 1); } else { if (shape.size() == 4UL) { - return nvinfer1::DimsNCHW(shape[0], shape[1], shape[2], shape[3]); + return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]); } else if (shape.size() == 3UL) { return nvinfer1::Dims3(shape[0], shape[1], shape[2]); } @@ -202,7 +201,15 @@ class TensorRTEngine { dy::initLibNvInferPlugins(&logger, ""); } - ~TensorRTEngine() {} + ~TensorRTEngine() { + for (auto& attr : attrs_) { + if (attr_dels_.find(attr.first) != attr_dels_.end()) { + attr_dels_[attr.first](); + } + } + attrs_.clear(); + attr_dels_.clear(); + } // Add an input and set its name, data type and dimension. nvinfer1::ITensor* DeclareInput(const std::string& name, @@ -268,23 +275,9 @@ class TensorRTEngine { } } - if (with_dynamic_shape_) { -#if IS_TRT_VERSION_GE(6000) - infer_engine_.reset(runtime->deserializeCudaEngine( - engine_serialized_data.c_str(), engine_serialized_data.size(), - nullptr)); -#else - - PADDLE_THROW(platform::errors::PreconditionNotMet( - "To enable dynamic shape support, the TensorRT version should be " - "greater than 6.0.0")); + infer_engine_.reset(runtime->deserializeCudaEngine( + engine_serialized_data.c_str(), engine_serialized_data.size())); -#endif - } else { - infer_engine_.reset(runtime->deserializeCudaEngine( - engine_serialized_data.c_str(), engine_serialized_data.size(), - &inference::Singleton::Global())); - } PADDLE_ENFORCE_NOT_NULL( infer_engine_, platform::errors::Fatal( @@ -306,8 +299,8 @@ class TensorRTEngine { int GetDeviceId() { return device_id_; } - nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, - int num_inputs, plugin::PluginTensorRT*); + nvinfer1::IPluginV2Layer* AddPlugin(nvinfer1::ITensor* const* inputs, + int num_inputs, plugin::PluginTensorRT*); nvinfer1::IPluginV2Layer* AddPluginV2Ext(nvinfer1::ITensor* const* inputs, int num_inputs, @@ -361,13 +354,7 @@ class TensorRTEngine { void Execute(int batch_size, std::vector* buffers, cudaStream_t stream = nullptr); - nvinfer1::INetworkDefinition* network() { - if (with_dynamic_shape_) { - return infer_networkv2_.get(); - } else { - return infer_network_.get(); - } - } + nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } ShapeMapType min_input_shape() { return min_input_shape_; } ShapeMapType max_input_shape() { return max_input_shape_; } @@ -386,6 +373,82 @@ class TensorRTEngine { } #endif + bool Has(const std::string& attr_name) const { + return attrs_.count(attr_name) > 0; + } + + void Erase(const std::string& attr_name) { + if (!Has(attr_name)) { + return; + } + if 
(attr_dels_.find(attr_name) != attr_dels_.end()) { + attr_dels_[attr_name](); + attr_dels_.erase(attr_name); + } + attrs_.erase(attr_name); + } + + // Set a pointer to the attribute. Engine takes ownership of the attribute. + template + void Set(const std::string& attr_name, AttrType* attr) { + if (attrs_.count(attr_name) == 0) { + PADDLE_ENFORCE_EQ( + attrs_.count(attr_name), 0, + platform::errors::AlreadyExists( + "Attribute %s already set in trt engine.", attr_name)); + } else { + VLOG(3) << "Setting the attribute " << attr_name << " for trt engine " + << this; + } + attrs_[attr_name] = attr; + attr_dels_[attr_name] = [attr, attr_name]() { + VLOG(3) << "deleting " << attr_name; + delete attr; + }; + } + + // Set a pointer to the attribute. Engine doesn't take ownership. Caller + // should delete the attribute. + template + void SetNotOwned(const std::string& attr_name, AttrType* attr) { + PADDLE_ENFORCE_EQ( + attrs_.count(attr_name), 0, + platform::errors::AlreadyExists( + "Attribute %s already set in trt engine.", attr_name)); + attrs_[attr_name] = attr; + } + + // Get a reference to the attributed previously set. + template + AttrType& Get(const std::string& attr_name) const { + PADDLE_ENFORCE_NE(attrs_.find(attr_name), attrs_.end(), + platform::errors::InvalidArgument( + "Attribute %s not found in trt engine.", attr_name)); + try { + return *boost::any_cast(attrs_.at(attr_name)); + } catch (boost::bad_any_cast&) { + auto TypeToString = [](const std::type_info& info) -> std::string { + if (std::type_index(info) == std::type_index(typeid(bool*))) { + return "bool"; + } else if (std::type_index(info) == std::type_index(typeid(int*))) { + return "int"; + } else if (std::type_index(info) == + std::type_index(typeid(const int*))) { + return "const int"; + } else if (std::type_index(info) == + std::type_index(typeid(std::string*))) { + return "std::string"; + } + return info.name(); + }; + + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid type for attritube %s, expected: %s, actual: %s.", attr_name, + TypeToString(typeid(AttrType*)), + TypeToString(attrs_.at(attr_name).type()))); + } + } + private: // Each ICudaEngine object is bound to a specific GPU when it is instantiated, // ensure that the thread is associated with the correct device by calling @@ -441,9 +504,11 @@ class TensorRTEngine { infer_ptr ihost_memory_; std::unordered_map quant_dynamic_range_; + std::unordered_map attrs_; + std::unordered_map> attr_dels_; + // For dynamic shape bool with_dynamic_shape_{false}; - infer_ptr infer_networkv2_; #if IS_TRT_VERSION_GE(6000) infer_ptr infer_builder_config_; nvinfer1::IOptimizationProfile* optim_profile_; diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 6158fd130bad8d4df70fafb2a9f72c00e40217fd..e3c7d8b10333c322be455c1f74912b2fb11ccb75 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -31,6 +31,10 @@ namespace tensorrt { ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version) +#define IS_TRT_VERSION_LT(version) \ + ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) < version) + #define TRT_VERSION \ NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD @@ -130,6 +134,19 @@ inline size_t ProductDim(const nvinfer1::Dims& dims) { return v; } +inline void PrintITensorShape(nvinfer1::ITensor* X) { + auto dims = 
X->getDimensions(); + auto name = X->getName(); + std::cout << "ITensor " << name << " shape: ["; + for (int i = 0; i < dims.nbDims; i++) { + if (i == dims.nbDims - 1) + std::cout << dims.d[i]; + else + std::cout << dims.d[i] << ", "; + } + std::cout << "]\n"; +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index c8dfc169535da01ea7b2afb97f51a8d67b2dfa43..f98b0c9ede76e2ec542a0c1d74ea13d0201e57f9 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/op_teller.h" + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/data_layout.h" @@ -49,6 +50,10 @@ struct SimpleOpTypeSetTeller : public Teller { #endif #if IS_TRT_VERSION_GE(7130) teller_set.insert("group_norm"); +#endif +#if CUDA_VERSION >= 10020 + teller_set.insert("reshape"); + teller_set.insert("reshape2"); #endif } @@ -102,6 +107,7 @@ struct SimpleOpTypeSetTeller : public Teller { "dropout", "prelu", "conv2d_transpose", + "depthwise_conv2d_transpose", "leaky_relu", "fc", "shuffle_channel", @@ -117,11 +123,13 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", + "gather_nd", "yolo_box", "roi_align", "affine_channel", "nearest_interp", "anchor_generator", + "reduce_sum", }; }; @@ -172,7 +180,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } if (op_type == "conv2d" || op_type == "conv2d_transpose" || - op_type == "conv2d_fusion") { + op_type == "conv2d_fusion" || op_type == "depthwise_conv2d" || + op_type == "depthwise_conv2d_transpose") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); @@ -202,7 +211,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } - if (op_type == "conv2d_transpose") { + if (op_type == "conv2d_transpose" || + op_type == "depthwise_conv2d_transpose") { if (!desc.HasAttr("dilations")) { return false; } else { @@ -222,6 +232,27 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << desc.Output("Output").size() << " output."; return false; } + +// strides > 1 and 'SAME' is only supported by trt7.0 above +#if !IS_TRT_VERSION_GE(7000) + if (op_type == "conv2d" || op_type == "conv2d_fusion" || + op_type == "depthwise_conv2d") { + if (desc.HasAttr("padding_algorithm") && with_dynamic_shape) { + auto padding_algorithm = + BOOST_GET_CONST(std::string, desc.GetAttr("padding_algorithm")); + if (padding_algorithm == "SAME" && desc.HasAttr("strides")) { + const std::vector strides = + BOOST_GET_CONST(std::vector, desc.GetAttr("strides")); + // there is no issue if strides.size() less than 2 + if (strides.size() > 1) { + for (size_t i = 0; i < strides.size(); i++) { + if (strides[i] > 1) return false; + } + } + } + } + } +#endif } if (op_type == "matmul") { @@ -269,31 +300,51 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis.size() >= nvinfer1::Dims::MAX_DIMS) return false; } } - if (op_type == "flatten2") { - // flatten doesn't support dynamic shape currently + if (op_type == "flatten2" || op_type == "flatten") { if (!desc.HasAttr("axis")) { return false; } else { +#if IS_TRT_VERSION_GE(7130) +#else if (with_dynamic_shape) return false; +#endif int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); if (axis != 1) return false; } } - if 
(op_type == "flatten") { - // flatten doesn't support dynamic shape currently - if (!desc.HasAttr("axis")) { - return false; - } else { - if (with_dynamic_shape) return false; - int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); - if (axis != 1) return false; + if (op_type == "gather") { + if (!with_dynamic_shape) return false; + auto inputs = desc.InputArgumentNames(); + for (auto& input : inputs) { + if (input == "Axis" && desc.Input("Axis").size() > 0) return false; } + // current not support axis from input, use default 0 + if (desc.GetAttrIfExists("axis")) return false; } - if (op_type == "gather") { - // current not support axis from input, use default 0 - if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; + if (op_type == "gather_nd") { + auto* block = desc.Block(); + auto x_var_name = desc.Input("X")[0]; + auto index_var_name = desc.Input("Index")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + auto* index_var_desc = block->FindVar(index_var_name); + + // The index input must be int32 datatype. + if (index_var_desc->GetDataType() != + paddle::framework::proto::VarType_Type::VarType_Type_INT32) { + VLOG(3) << "gather_nd op Index input data type must be int32"; + return false; + } + + const auto index_shape = index_var_desc->GetShape(); + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() != index_shape.size()) { + VLOG(3) << "gather_nd op Index input dims size [" << index_shape.size() + << " ] not equal to x dims size [" << x_shape.size() << "]"; + return false; + } + if (!with_dynamic_shape) return false; } if (op_type == "yolo_box") { @@ -630,8 +681,52 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "fc") { + int x_num_col_dims = + desc.HasAttr("x_num_col_dims") + ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims")) + : (desc.HasAttr("in_num_col_dims") + ? 
BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims")) + : 1); + if (x_num_col_dims < 1) { + VLOG(3) << "converter expects x_num_col_dims >= 1, " + "but x_num_col_dims = %d."; + return false; + } + } + if (op_type == "reshape" || op_type == "reshape2") { + if (!desc.HasAttr("shape")) { + return false; + } + // Paddle-TRT does not support the input tensors: Shape and ShapeTensor + if (desc.Input("Shape").size() >= 1 || + desc.Input("ShapeTensor").size() >= 1) { + return false; + } + std::vector shape = + BOOST_GET_CONST(std::vector, desc.GetAttr("shape")); + if (shape.size() >= nvinfer1::Dims::MAX_DIMS) return false; + if (!with_dynamic_shape && shape[0] == -1) return false; + } + + if (op_type == "reduce_sum") { + if (!with_dynamic_shape) { + VLOG(3) << "the reduce_sum does not support static shape yet"; + return false; + } + + if (!(desc.HasAttr("keep_dim") && desc.HasAttr("dim") && + desc.HasAttr("reduce_all"))) { + VLOG(3) << "the reduce_sum does not have attr (keep_dim or dim or " + "reduce_all)"; + return false; + } + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } + + VLOG(3) << "trt unsupported op " << op_type; return false; } diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 1804e6c5571d3a15b0b9adc67dc535b46635caa8..311c2312a9f45b2e8a7b93750c95b95d73b07fc9 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,6 +1,6 @@ nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu - prelu_op_plugin.cu trt_plugin_factory.cc gelu_op_plugin.cu + prelu_op_plugin.cu gelu_op_plugin.cu pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu @@ -8,6 +8,7 @@ nv_library(tensorrt_plugin anchor_generator_op_plugin.cu yolo_box_op_plugin.cu roi_align_op_plugin.cu + gather_nd_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu index 01ee86ceb48a9ef022ba73fe0dbdab4a52324cc6..8cf9178b6f139ba62b72640ed575fde951eb4d48 100644 --- a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu @@ -18,8 +18,6 @@ #include #include "paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" - #include "paddle/fluid/operators/detection/anchor_generator_op.h" namespace paddle { @@ -166,7 +164,11 @@ int AnchorGeneratorPlugin::enqueue_impl(int batch_size, } int AnchorGeneratorPlugin::enqueue(int batch_size, const void* const* inputs, +#if IS_TRT_VERSION_LT(8000) void** outputs, void* workspace, +#else + void* const* outputs, void* workspace, +#endif cudaStream_t stream) { return enqueue_impl(batch_size, inputs, outputs, workspace, stream); } @@ -215,7 +217,7 @@ const char* AnchorGeneratorPlugin::getPluginNamespace() const { nvinfer1::DataType AnchorGeneratorPlugin::getOutputDataType( int index, const nvinfer1::DataType* input_type, int nb_inputs) const { - return data_type_; + return input_type[0]; } bool AnchorGeneratorPlugin::isOutputBroadcastAcrossBatch( @@ -456,7 +458,7 @@ int 
AnchorGeneratorPluginDynamic::enqueue( nvinfer1::DataType AnchorGeneratorPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { - return data_type_; + return inputTypes[0]; } const char* AnchorGeneratorPluginDynamic::getPluginType() const { diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h index aff0b6a6802f114a25acf32627a39ca42d572d7c..458326d0679ca96df16db1287139de986f2f3cb4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h @@ -42,7 +42,11 @@ class AnchorGeneratorPlugin : public nvinfer1::IPluginV2Ext { bool supportsFormat(nvinfer1::DataType type, nvinfer1::TensorFormat format) const override; size_t getWorkspaceSize(int max_batch_size) const override; +#if IS_TRT_VERSION_LT(8000) int enqueue(int batch_size, const void* const* inputs, void** outputs, +#else + int enqueue(int batch_size, const void* const* inputs, void* const* outputs, +#endif void* workspace, cudaStream_t stream) override; int initialize() override; void terminate() override; diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index cc17f8aa2481708e3e19c9925a1d83ad06203145..3338aae370e51452c3d390a23f47a8848e6f9236 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -14,19 +14,12 @@ limitations under the License. */ #include #include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { -ElementWisePlugin *CreateElementWisePluginDeserialize(const void *buffer, - size_t length) { - return new ElementWisePlugin(buffer, length); -} -REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize); - namespace details { template struct Add { @@ -122,7 +115,11 @@ int ElementWisePlugin::initialize() { } int ElementWisePlugin::enqueue(int batch_size, const void *const *inputs, +#if IS_TRT_VERSION_LT(8000) void **outputs, void *workspace, +#else + void *const *outputs, void *workspace, +#endif cudaStream_t stream) { const float *x = reinterpret_cast(inputs[0]); const float *y = reinterpret_cast(inputs[1]); diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h index 75a1dd85f0f2c440fdd16beb95144df4127739e6..5dd3142c758398ab6124124ec98a1f141f103d1c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -40,14 +40,16 @@ class ElementWisePlugin : public PluginTensorRT { const char* elementwise_type; DeserializeValue(&serial_data, &serial_length, &elementwise_type); type_ = std::string(elementwise_type); - DeserializeValue(&serial_data, &serial_length, &axis_); DeserializeValue(&serial_data, &serial_length, &dims_x_); DeserializeValue(&serial_data, &serial_length, &dims_y_); + DeserializeValue(&serial_data, &serial_length, &axis_); + DeserializeValue(&serial_data, &serial_length, &prev_size_); + DeserializeValue(&serial_data, &serial_length, &midd_size_); + DeserializeValue(&serial_data, &serial_length, &post_size_); } ElementWisePlugin* clone() 
const override { - // return new ElementWisePlugin(dims_x_, dims_y_, axis_); - return nullptr; + return new ElementWisePlugin(type_, dims_x_, dims_y_, axis_); } const char* getPluginType() const override { return "elementwise_plugin"; } @@ -58,26 +60,32 @@ class ElementWisePlugin : public PluginTensorRT { int initialize() override; - // execute the layer +#if IS_TRT_VERSION_LT(8000) int enqueue(int batch_size, const void* const* inputs, void** outputs, +#else + int enqueue(int batch_size, const void* const* inputs, void* const* outputs, +#endif void* workspace, cudaStream_t stream); - protected: - size_t getSerializationSize() override { - return SerializedSize(getPluginType()) + SerializedSize(axis_) + + size_t getSerializationSize() const override { + return getBaseSerializationSize() + SerializedSize(type_.c_str()) + SerializedSize(dims_x_) + SerializedSize(dims_y_) + - getBaseSerializationSize(); + SerializedSize(axis_) + SerializedSize(prev_size_) + + SerializedSize(midd_size_) + SerializedSize(post_size_); } - void serialize(void* buffer) override { - SerializeValue(&buffer, getPluginType()); + void serialize(void* buffer) const override { serializeBase(buffer); SerializeValue(&buffer, type_.c_str()); - SerializeValue(&buffer, axis_); SerializeValue(&buffer, dims_x_); SerializeValue(&buffer, dims_y_); + SerializeValue(&buffer, axis_); + SerializeValue(&buffer, prev_size_); + SerializeValue(&buffer, midd_size_); + SerializeValue(&buffer, post_size_); } + protected: std::string type_; nvinfer1::Dims dims_x_; nvinfer1::Dims dims_y_; @@ -87,6 +95,20 @@ class ElementWisePlugin : public PluginTensorRT { int post_size_; }; +class ElementWisePluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const override { return "elementwise_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + return new ElementWisePlugin(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(ElementWisePluginCreator); + #if IS_TRT_VERSION_GE(6000) class ElementwisePluginDynamic : public DynamicPluginTensorRT { public: @@ -102,7 +124,9 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { return new ElementwisePluginDynamic(type_, axis_); } - const char* getPluginType() const override { return "elementwise_plugin"; } + const char* getPluginType() const override { + return "elementwise_plugin_dynamic"; + } int getNbOutputs() const override { return 1; } int initialize() override; @@ -147,7 +171,9 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { class ElementwisePluginDynamicCreator : public nvinfer1::IPluginCreator { public: ElementwisePluginDynamicCreator() {} - const char* getPluginName() const override { return "elementwise_plugin"; } + const char* getPluginName() const override { + return "elementwise_plugin_dynamic"; + } const char* getPluginVersion() const override { return "1"; } diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu index 6d3872aaeb8a77acf1455e4d5e555ee01d36478a..79fc3d66bbe4dd71e48319861868cc705e5d6dfd 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu @@ -20,7 +20,6 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include 
"paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" namespace paddle { @@ -134,7 +133,7 @@ int EmbEltwiseLayernormPluginDynamicImpl::enqueue( int batch = id_dims.d[0]; int seq_len = id_dims.d[1]; int input_num = embs_.size(); - + cudaGetDevice(&device_id_); auto in_ptr_gpu_d = in_ptr_tensor_.mutable_data(platform::CUDAPlace(device_id_)); auto emb_ptr_gpu_d = diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu new file mode 100644 index 0000000000000000000000000000000000000000..933ca333cdbb93c047d4390023de29d434753074 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu @@ -0,0 +1,228 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include + +#include "NvInferRuntimeCommon.h" +#include "paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) + +template +__global__ void GatherNdCUDAKernel(const T* input, const int32_t* input_dims, + const IndexT* indices, T* output, + int32_t remain_size, int32_t slice_size, + int32_t end_size) { + CUDA_KERNEL_LOOP(i, remain_size * slice_size) { + int indices_i = i / slice_size; + int slice_i = i - indices_i * slice_size; // offset inside the slice + IndexT gather_i = 0; + int32_t temp = slice_size; + for (int32_t j = end_size - 1; j >= 0; --j) { + auto index_value = indices[indices_i * end_size + j]; + PADDLE_ENFORCE( + index_value >= 0 && index_value < input_dims[j], + "The index is out of bounds, " + "please check whether the dimensions of index and " + "input meet the requirements. 
It should " + "be less than [%d] and greater or equal to 0, but received [%d]", + input_dims[j], index_value); + gather_i += (index_value * temp); + temp *= input_dims[j]; + } + IndexT input_i = gather_i + slice_i; + *(output + i) = *(input + input_i); + } +} + +int GatherNdPluginDynamic::initialize() { return 0; } + +size_t GatherNdPluginDynamic::getSerializationSize() const { + return SerializedSize(with_fp16_); +} + +void GatherNdPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, with_fp16_); +} + +nvinfer1::DimsExprs GatherNdPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) { + PADDLE_ENFORCE_EQ( + nb_inputs, 2, + platform::errors::InvalidArgument( + "The gather_nd plugin should have 2 input, but got %d.", nb_inputs)); + PADDLE_ENFORCE_EQ(output_index, 0, + platform::errors::InvalidArgument( + "When GetOutputDimensions in gather_nd " + "plugin, the output_index should be 0.")); + + nvinfer1::DimsExprs x_dims = inputs[0]; + nvinfer1::DimsExprs index_dims = inputs[1]; + + int32_t x_dims_size = x_dims.nbDims; + int32_t index_dims_size = index_dims.nbDims; + + // TODO(wilber): The result dims shoule be Index.shape[:-1] + + // X.shape[Index.shape[-1]:], but the trt DimsExprs is an expression we can't + // get the actual value. So we only support one scenario: input_dims.size == + // index_dims.size. + nvinfer1::DimsExprs ret(x_dims); + for (int i = 0; i < index_dims_size - 1; ++i) { + ret.d[i] = index_dims.d[i]; + } + + return ret; +} + +bool GatherNdPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, + int nb_outputs) { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of gather_nd plugin should not be nullptr.")); + + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + (in_out && pos < (nb_inputs + nb_outputs)); + + const nvinfer1::PluginTensorDesc& in = in_out[pos]; + if (pos == 0) { + if (with_fp16_) { + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + } else if (pos == 1) { + return in.type == nvinfer1::DataType::kINT32 && + in.format == nvinfer1::TensorFormat::kLINEAR; + } else if (pos == 2) { + return in.type == in_out[0].type && + in.format == nvinfer1::TensorFormat::kLINEAR; + } + + return true; +} + +nvinfer1::DataType GatherNdPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* input_types, int nb_inputs) const { + return input_types[0]; +} + +int GatherNdPluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream) { + auto input_dims = input_desc[0].dims; + auto index_dims = input_desc[1].dims; + auto input_dims_size = input_dims.nbDims; + auto index_dims_size = index_dims.nbDims; + + std::vector input_shape, index_shape, out_shape; + for (int i = 0; i < input_dims.nbDims; i++) + input_shape.push_back(input_dims.d[i]); + for (int i = 0; i < index_dims.nbDims; i++) + index_shape.push_back(index_dims.d[i]); + // The out_shape is + // Index.shape[:-1] + 
X.shape[Index.shape[-1]:] + for (int i = 0; i < index_dims_size - 1; ++i) { + out_shape.emplace_back(index_shape[i]); + } + for (int i = index_shape[index_dims_size - 1]; i < input_dims_size; ++i) { + out_shape.emplace_back(input_shape[i]); + } + + // final dim + int end_size = index_shape[index_dims_size - 1]; + // remain dim + std::vector remain_ddim(index_shape.begin(), index_shape.end() - 1); + int remain_numel = std::accumulate(remain_ddim.begin(), remain_ddim.end(), 1, + std::multiplies()); + // slice size + int slice_size = 1; + for (int i = end_size; i < input_dims_size; ++i) { + slice_size *= input_shape[i]; + } + + auto input_type = input_desc[0].type; + if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. gather_nd-->fp32"; + + const float* p_input = static_cast(inputs[0]); + const int32_t* p_index = static_cast(inputs[1]); + float* p_output = static_cast(outputs[0]); + + if (input_dims_data_ == nullptr) { + cudaMalloc(&input_dims_data_, input_shape.size() * sizeof(int)); + } + cudaMemcpyAsync(input_dims_data_, input_shape.data(), + sizeof(int) * input_shape.size(), cudaMemcpyHostToDevice, + stream); + + int block = 512; + int n = slice_size * remain_numel; + int grid = (n + block - 1) / block; + + GatherNdCUDAKernel<<>>( + p_input, input_dims_data_, p_index, p_output, remain_numel, slice_size, + end_size); + } else if (input_type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. gather_nd-->fp16"; + + const half* p_input = static_cast(inputs[0]); + const int32_t* p_index = static_cast(inputs[1]); + half* p_output = static_cast(outputs[0]); + + if (input_dims_data_ == nullptr) { + cudaMalloc(&input_dims_data_, input_shape.size() * sizeof(int)); + } + cudaMemcpyAsync(input_dims_data_, input_shape.data(), + sizeof(int) * input_shape.size(), cudaMemcpyHostToDevice, + stream); + + int block = 512; + int n = slice_size * remain_numel; + int grid = (n + block - 1) / block; + + GatherNdCUDAKernel<<>>( + p_input, input_dims_data_, p_index, p_output, remain_numel, slice_size, + end_size); + } + + return cudaGetLastError() != cudaSuccess; +} +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..0a242238c81fb3b34888905a393bc992179712b2 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h @@ -0,0 +1,132 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
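// [Editor's note] A minimal, hedged sketch (not part of this patch; the
// function name below is illustrative) of the shape rule that the gather_nd
// enqueue() implementation above computes:
//   out.shape = Index.shape[:-1] + X.shape[Index.shape[-1]:]
// where end_size = Index.shape[-1] is the number of leading axes of X that
// each index tuple addresses.
#include <cstdint>
#include <vector>

std::vector<int64_t> GatherNdOutShape(const std::vector<int64_t>& x_dims,
                                      const std::vector<int64_t>& index_dims) {
  const int64_t end_size = index_dims.back();  // length of each index tuple
  // Leading output axes come from Index.shape[:-1] ...
  std::vector<int64_t> out(index_dims.begin(), index_dims.end() - 1);
  // ... and the trailing axes are the un-gathered tail of X.shape.
  for (size_t i = static_cast<size_t>(end_size); i < x_dims.size(); ++i) {
    out.push_back(x_dims[i]);
  }
  return out;
}
// Example: x_dims = {5, 4, 3}, index_dims = {2, 2} -> out = {2, 3}:
// two index pairs, each addressing a position in the leading (5, 4) axes,
// while the last axis of X is copied as a contiguous slice of size 3.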
+ +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +class GatherNdPluginDynamic : public DynamicPluginTensorRT { + public: + explicit GatherNdPluginDynamic(bool with_fp16) { with_fp16_ = with_fp16; } + + GatherNdPluginDynamic(void const* serial_data, size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &with_fp16_); + } + + nvinfer1::IPluginV2DynamicExt* clone() const override { + return new GatherNdPluginDynamic(with_fp16_); + } + + const char* getPluginType() const override { return "gather_nd_plugin"; } + int getNbOutputs() const override { return 1; } + int initialize() override; + + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override {} + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override { + return 0; + } + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + void destroy() override { + if (input_dims_data_) { + cudaFree(input_dims_data_); + } + delete this; + } + + private: + int32_t* input_dims_data_{nullptr}; +}; + +class GatherNdPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + GatherNdPluginDynamicCreator() {} + const char* getPluginName() const override { return "gather_nd_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new GatherNdPluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(GatherNdPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu index deda2e2cc7247f404ff6d11409b665898d550ee1..43557c341ef42e5bcccc29fec259f2625b4ceeb2 100644 
--- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu @@ -16,7 +16,6 @@ #include #include #include "paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -31,21 +30,15 @@ static const float kAT = 0.5; static const float kBT = 0.7978845608028654; // sqrt(2.0/M_PI) static const float kCT = 0.035677408136300125; // 0.044715 * sqrt(2.0/M_PI) -GeluPlugin* CreateGeluPluginDeserialize(const void* buffer, size_t length) { - return new GeluPlugin(buffer, length); -} - -REGISTER_TRT_PLUGIN("gelu_plugin", CreateGeluPluginDeserialize); - bool GeluPlugin::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const { if (with_fp16_) { return ((type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF) && - (format == nvinfer1::PluginFormat::kNCHW)); + (format == nvinfer1::PluginFormat::kLINEAR)); } else { return ((type == nvinfer1::DataType::kFLOAT) && - (format == nvinfer1::PluginFormat::kNCHW)); + (format == nvinfer1::PluginFormat::kLINEAR)); } } @@ -100,7 +93,11 @@ __global__ void no_exact_gelu_kernel(const T a, const T b, const T c, int n, } int GeluPlugin::enqueue(int batch_size, const void* const* inputs, +#if IS_TRT_VERSION_LT(8000) void** outputs, void*, cudaStream_t stream) { +#else + void* const* outputs, void*, cudaStream_t stream) { +#endif const auto& input_dims = this->getInputDims(0); int num = batch_size; for (int i = 0; i < input_dims.nbDims; i++) { diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h index 23e507ee477e1a3b85339c7b267b290de19805ab..6fdd9791a61bdb3ac6b73c2d7e3805325a7f4cf1 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h @@ -44,21 +44,35 @@ class GeluPlugin : public PluginTensorRT { nvinfer1::PluginFormat format) const override; nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nb_input_dims) override; +#if IS_TRT_VERSION_LT(8000) int enqueue(int batch_size, const void* const* inputs, void** outputs, +#else + int enqueue(int batch_size, const void* const* inputs, void* const* outputs, +#endif void* workspace, cudaStream_t stream) override; - protected: - size_t getSerializationSize() override { - return getBaseSerializationSize() + SerializedSize(getPluginType()); + size_t getSerializationSize() const override { + return getBaseSerializationSize(); } // TRT will call this func to serialize the configuration of TRT // It should not be called by users. 
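// [Editor's note] Illustrative sketch only (not part of this patch; the class
// name is made up) of the version guard repeated across these plugins:
// TensorRT 8 changed the output-buffer parameter of IPluginV2::enqueue from
// `void**` to `void* const*`, so the declaration and definition must switch on
// the same IS_TRT_VERSION_LT(8000) condition (assumed to come from the
// project's plugin headers), otherwise the method no longer overrides the
// base-class virtual. The base class and `override` are elided so the
// fragment stands alone.
#include <cuda_runtime_api.h>  // for cudaStream_t

class ExamplePlugin /* : public PluginTensorRT */ {
 public:
#if IS_TRT_VERSION_LT(8000)
  int enqueue(int batch_size, const void* const* inputs, void** outputs,
#else
  int enqueue(int batch_size, const void* const* inputs, void* const* outputs,
#endif
              void* workspace, cudaStream_t stream) /* override */;
};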
- void serialize(void* buffer) override { - SerializeValue(&buffer, getPluginType()); - serializeBase(buffer); + void serialize(void* buffer) const override { serializeBase(buffer); } +}; + +class GeluPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const override { return "gelu_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + return new GeluPlugin(serial_data, serial_length); } }; +REGISTER_TRT_PLUGIN_V2(GeluPluginCreator); #if IS_TRT_VERSION_GE(6000) class GeluPluginDynamic : public DynamicPluginTensorRT { @@ -73,7 +87,7 @@ class GeluPluginDynamic : public DynamicPluginTensorRT { return new GeluPluginDynamic(with_fp16_); } - const char* getPluginType() const override { return "gelu_plugin"; } + const char* getPluginType() const override { return "gelu_plugin_dynamic"; } int getNbOutputs() const override { return 1; } int initialize() override { return 0; } @@ -115,44 +129,19 @@ class GeluPluginDynamic : public DynamicPluginTensorRT { void destroy() override { delete this; } }; -class GeluPluginDynamicCreator : public nvinfer1::IPluginCreator { +class GeluPluginDynamicCreator : public TensorRTPluginCreator { public: - GeluPluginDynamicCreator() {} - const char* getPluginName() const override { return "gelu_plugin"; } + const char* getPluginName() const override { return "gelu_plugin_dynamic"; } const char* getPluginVersion() const override { return "1"; } - const nvinfer1::PluginFieldCollection* getFieldNames() override { - return &field_collection_; - } - - nvinfer1::IPluginV2* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override { - return nullptr; - } - nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serial_data, size_t serial_length) override { auto plugin = new GeluPluginDynamic(serial_data, serial_length); return plugin; } - - void setPluginNamespace(const char* lib_namespace) override { - plugin_namespace_ = lib_namespace; - } - - const char* getPluginNamespace() const override { - return plugin_namespace_.c_str(); - } - - private: - std::string plugin_namespace_; - std::string plugin_name_; - nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; - std::vector plugin_attributes_; }; - REGISTER_TRT_PLUGIN_V2(GeluPluginDynamicCreator); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu index 8b2d0ac3cf70f77f1ff9ce9a6fe2ed19fdcf9576..dab7ddac1957a1aa62a4edd18f97b9601b2d56aa 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu @@ -15,20 +15,12 @@ #include #include #include "paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { -HardSwishPlugin* CreateHardSwishPluginDeserialize(const void* buffer, - size_t length) { - return new HardSwishPlugin(buffer, length); -} - -REGISTER_TRT_PLUGIN("hard_swish_plugin", CreateHardSwishPluginDeserialize); - nvinfer1::Dims HardSwishPlugin::getOutputDimensions( int index, const nvinfer1::Dims* in_dims, int nb_inputs) { assert(nb_inputs == 1); @@ -59,7 +51,11 @@ __global__ void hard_swish_kernel(float threshold, float scale, float offset, } int 
HardSwishPlugin::enqueue(int batch_size, const void* const* inputs, +#if IS_TRT_VERSION_LT(8000) void** outputs, void*, cudaStream_t stream) { +#else + void* const* outputs, void*, cudaStream_t stream) { +#endif const auto& input_dims = this->getInputDims(0); int num = batch_size; for (int i = 0; i < input_dims.nbDims; i++) { diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h index 2e1e1d03baf7e1cb046f887f2d799a907f3586d4..42c47959988a500043534d3af228f073ba202536 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h @@ -49,30 +49,46 @@ class HardSwishPlugin : public PluginTensorRT { int initialize() override { return 0; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override; +#if IS_TRT_VERSION_LT(8000) int enqueue(int batchSize, const void* const* inputs, void** outputs, +#else + int enqueue(int batchSize, const void* const* inputs, void* const* outputs, +#endif void* workspace, cudaStream_t stream) override; - protected: - float threshold_; - float scale_; - float offset_; - - size_t getSerializationSize() override { + size_t getSerializationSize() const override { return getBaseSerializationSize() + SerializedSize(threshold_) + - SerializedSize(scale_) + SerializedSize(offset_) + - SerializedSize(getPluginType()); + SerializedSize(scale_) + SerializedSize(offset_); } // TRT will call this func to serialize the configuration of TRT // It should not be called by users. - void serialize(void* buffer) override { - SerializeValue(&buffer, getPluginType()); + void serialize(void* buffer) const override { serializeBase(buffer); SerializeValue(&buffer, threshold_); SerializeValue(&buffer, scale_); SerializeValue(&buffer, offset_); } + + protected: + float threshold_; + float scale_; + float offset_; +}; + +class HardSwishPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const override { return "hard_swish_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + return new HardSwishPlugin(serial_data, serial_length); + } }; +REGISTER_TRT_PLUGIN_V2(HardSwishPluginCreator); } // namespace plugin } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu index a579743ee8ad1a9ae480cebf03380635c3a300c4..13aa6df643e82aa5d52abb06a4127d75a8664779 100644 --- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu @@ -17,7 +17,6 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { @@ -40,13 +39,6 @@ cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, return CUDNN_STATUS_SUCCESS; } -InstanceNormPlugin *CreateInstanceNormPluginDeserialize(const void *buffer, - size_t length) { - return new InstanceNormPlugin(buffer, length); -} -REGISTER_TRT_PLUGIN("instance_norm_plugin", - CreateInstanceNormPluginDeserialize); - int InstanceNormPlugin::initialize() { return 0; } nvinfer1::Dims 
InstanceNormPlugin::getOutputDimensions( @@ -58,8 +50,19 @@ nvinfer1::Dims InstanceNormPlugin::getOutputDimensions( return output_dims; } +bool InstanceNormPlugin::supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const { + return ((type == nvinfer1::DataType::kFLOAT || + type == nvinfer1::DataType::kHALF) && + (format == nvinfer1::PluginFormat::kLINEAR)); +} + int InstanceNormPlugin::enqueue(int batch_size, const void *const *inputs, +#if IS_TRT_VERSION_LT(8000) void **outputs, void *workspace, +#else + void *const *outputs, void *workspace, +#endif cudaStream_t stream) { const auto &input_dims = this->getInputDims(0); diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h index 83422708f593d8fef66bb2d3b463ede80f041398..f9dab09beebd3a11dd008cdf693a47f043981acc 100644 --- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h @@ -38,25 +38,22 @@ class InstanceNormPlugin : public PluginTensorRT { cudnnHandle_t handle_; cudnnTensorDescriptor_t x_desc_, y_desc_, b_desc_; - protected: - size_t getSerializationSize() override { + public: + size_t getSerializationSize() const override { return getBaseSerializationSize() + SerializedSize(eps_) + - SerializedSize(scale_) + SerializedSize(bias_) + - SerializedSize(getPluginType()); + SerializedSize(scale_) + SerializedSize(bias_); } // TRT will call this func when we need to serialize the configuration of // tensorrt. // It should not be called by users. - void serialize(void *buffer) override { - SerializeValue(&buffer, getPluginType()); + void serialize(void *buffer) const override { serializeBase(buffer); SerializeValue(&buffer, eps_); SerializeValue(&buffer, scale_); SerializeValue(&buffer, bias_); } - public: explicit InstanceNormPlugin(const float eps, const std::vector scale, const std::vector bias) : eps_(eps), scale_(scale), bias_(bias) { @@ -91,6 +88,7 @@ class InstanceNormPlugin : public PluginTensorRT { platform::dynload::cudnnDestroyTensorDescriptor(y_desc_); platform::dynload::cudnnDestroyTensorDescriptor(b_desc_); } + int initialize() override; InstanceNormPlugin *clone() const override { @@ -101,16 +99,31 @@ class InstanceNormPlugin : public PluginTensorRT { int getNbOutputs() const override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, int nbInputDims) override; + +#if IS_TRT_VERSION_LT(8000) int enqueue(int batchSize, const void *const *inputs, void **outputs, +#else + int enqueue(int batchSize, const void *const *inputs, void *const *outputs, +#endif void *workspace, cudaStream_t stream) override; bool supportsFormat(nvinfer1::DataType type, - nvinfer1::PluginFormat format) const override { - return ((type == nvinfer1::DataType::kFLOAT || - type == nvinfer1::DataType::kHALF) && - (format == nvinfer1::PluginFormat::kNCHW)); + nvinfer1::PluginFormat format) const override; +}; + +class InstanceNormPluginCreator : public TensorRTPluginCreator { + public: + const char *getPluginName() const override { return "instance_norm_plugin"; } + + const char *getPluginVersion() const override { return "1"; } + + nvinfer1::IPluginV2 *deserializePlugin(const char *name, + const void *serial_data, + size_t serial_length) override { + return new InstanceNormPlugin(serial_data, serial_length); } }; +REGISTER_TRT_PLUGIN_V2(InstanceNormPluginCreator); } // namespace plugin } // namespace tensorrt diff --git 
a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 8af036a0e86709336b0ef8b3310442cb7374bfbc..2688380726f78e299b6169f30f01bb691d73361f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -17,7 +17,6 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/operators/layer_norm_op.h" namespace paddle { @@ -25,12 +24,6 @@ namespace inference { namespace tensorrt { namespace plugin { -LayerNormPlugin *CreateLayerNormPluginDeserialize(const void *buffer, - size_t length) { - return new LayerNormPlugin(buffer, length); -} -REGISTER_TRT_PLUGIN("layer_norm_plugin", CreateLayerNormPluginDeserialize); - int LayerNormPlugin::initialize() { return 0; } nvinfer1::Dims LayerNormPlugin::getOutputDimensions( @@ -43,7 +36,11 @@ nvinfer1::Dims LayerNormPlugin::getOutputDimensions( } int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, +#if IS_TRT_VERSION_LT(8000) void **outputs, void *workspace, +#else + void *const *outputs, void *workspace, +#endif cudaStream_t stream) { const auto &input_dims = this->getInputDims(0); const float *input = reinterpret_cast(inputs[0]); @@ -57,8 +54,18 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, input_shape.push_back(input_dims.d[i]); } const auto input_ddim = framework::make_ddim(input_shape); - auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis - 1); + auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis); int feature_size = static_cast(matrix_dim[1]); + PADDLE_ENFORCE_EQ(feature_size, scale_.size(), + platform::errors::InvalidArgument( + "scale's size should be equal to the feature_size," + "but got feature_size:%d, scale's size:%d.", + feature_size, scale_.size())); + PADDLE_ENFORCE_EQ(feature_size, bias_.size(), + platform::errors::InvalidArgument( + "bias's size should be equal to the feature_size," + "but got feature_size:%d, bias's size:%d.", + feature_size, bias_.size())); scale_t.Resize(framework::make_ddim({feature_size})); bias_t.Resize(framework::make_ddim({feature_size})); @@ -82,6 +89,103 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs, return cudaGetLastError() != cudaSuccess; } +nvinfer1::DimsExprs LayerNormPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs *inputDims, int nb_inputs, + nvinfer1::IExprBuilder &expr_builder) { + return inputDims[0]; +} + +bool LayerNormPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, + int nb_outputs) { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of layernorm plugin shoule not be nullptr.")); + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + const nvinfer1::PluginTensorDesc &in = in_out[pos]; + if (pos == 0) { + // TODO(Shangzhizhou) FP16 support + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1]; + // output + return in.type == prev.type && in.format == prev.format; +} + +nvinfer1::DataType 
LayerNormPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType *input_types, int nb_inputs) const { + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The LayerNormPlugin only has one input, so the " + "index value should be 0, but get %d.", + index)); + return input_types[0]; +} + +int LayerNormPluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc *input_desc, + const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, + void *const *outputs, void *workspace, cudaStream_t stream) { + const auto &input_dims = input_desc[0].dims; + int begin_norm_axis = begin_norm_axis_; + float eps = eps_; + + std::vector input_shape; + for (int i = 0; i < input_dims.nbDims; i++) { + input_shape.push_back(input_dims.d[i]); + } + const auto input_ddim = framework::make_ddim(input_shape); + auto matrix_dim = framework::flatten_to_2d(input_ddim, begin_norm_axis); + int feature_size = static_cast(matrix_dim[1]); + PADDLE_ENFORCE_EQ(feature_size, scale_.size(), + platform::errors::InvalidArgument( + "scale's size should be equal to the feature_size," + "but got feature_size:%d, scale's size:%d.", + feature_size, scale_.size())); + PADDLE_ENFORCE_EQ(feature_size, bias_.size(), + platform::errors::InvalidArgument( + "bias's size should be equal to the feature_size," + "but got feature_size:%d, bias's size:%d.", + feature_size, bias_.size())); + int device_id; + cudaGetDevice(&device_id); + auto input_type = input_desc[0].type; + if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp32"; + const float *input = reinterpret_cast(inputs[0]); + float *output = static_cast(outputs[0]); + scale_t.Resize(framework::make_ddim({feature_size})); + bias_t.Resize(framework::make_ddim({feature_size})); + mean_t.Resize(framework::make_ddim(mean_shape_)); + variance_t.Resize(framework::make_ddim(variance_shape_)); + + float *scale_d = + scale_t.mutable_data(platform::CUDAPlace(device_id)); + float *bias_d = bias_t.mutable_data(platform::CUDAPlace(device_id)); + float *mean_d = mean_t.mutable_data(platform::CUDAPlace(device_id)); + float *variance_d = + variance_t.mutable_data(platform::CUDAPlace(device_id)); + + cudaMemcpyAsync(scale_d, scale_.data(), sizeof(float) * feature_size, + cudaMemcpyHostToDevice, stream); + cudaMemcpyAsync(bias_d, bias_.data(), sizeof(float) * feature_size, + cudaMemcpyHostToDevice, stream); + + paddle::operators::LayerNormDirectCUDAFunctor layer_norm; + layer_norm(stream, input, input_shape, bias_d, scale_d, output, mean_d, + variance_d, begin_norm_axis, eps); + } else { + PADDLE_THROW(platform::errors::Fatal( + "The LayerNorm TRT Plugin's input type should be float.")); + } + return cudaGetLastError() != cudaSuccess; +} + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h index 050ef3b77d3157f89edee949a3a86923846cc3f7..caa3c21db63fab389f89e300501c2890a2a5f949 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h @@ -39,19 +39,18 @@ class LayerNormPlugin : public PluginTensorRT { std::vector mean_shape_; std::vector variance_shape_; - protected: - size_t getSerializationSize() override { + public: + size_t getSerializationSize() const override { return getBaseSerializationSize() + SerializedSize(bias_) + SerializedSize(scale_) + 
SerializedSize(begin_norm_axis_) + SerializedSize(eps_) + SerializedSize(mean_shape_) + - SerializedSize(variance_shape_) + SerializedSize(getPluginType()); + SerializedSize(variance_shape_); } // TRT will call this func when we need to serialize the configuration of // tensorrt. // It should not be called by users. - void serialize(void *buffer) override { - SerializeValue(&buffer, getPluginType()); + void serialize(void* buffer) const override { serializeBase(buffer); SerializeValue(&buffer, bias_); SerializeValue(&buffer, scale_); @@ -61,8 +60,7 @@ class LayerNormPlugin : public PluginTensorRT { SerializeValue(&buffer, variance_shape_); } - public: - LayerNormPlugin(const float *bias, const int bias_num, const float *scale, + LayerNormPlugin(const float* bias, const int bias_num, const float* scale, const int scale_num, int begin_norm_axis, float eps, std::vector mean_shape, std::vector variance_shape) @@ -78,7 +76,7 @@ class LayerNormPlugin : public PluginTensorRT { // It was used for tensorrt deserialization. // It should not be called by users. - LayerNormPlugin(void const *serialData, size_t serialLength) { + LayerNormPlugin(void const* serialData, size_t serialLength) { deserializeBase(serialData, serialLength); DeserializeValue(&serialData, &serialLength, &bias_); DeserializeValue(&serialData, &serialLength, &scale_); @@ -90,20 +88,150 @@ class LayerNormPlugin : public PluginTensorRT { ~LayerNormPlugin() {} int initialize() override; - LayerNormPlugin *clone() const override { + LayerNormPlugin* clone() const override { return new LayerNormPlugin(bias_.data(), bias_.size(), scale_.data(), scale_.size(), begin_norm_axis_, eps_, mean_shape_, variance_shape_); } - const char *getPluginType() const override { return "layer_norm_plugin"; } + const char* getPluginType() const override { return "layernorm_plugin"; } int getNbOutputs() const override { return 1; } - nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override; - int enqueue(int batchSize, const void *const *inputs, void **outputs, - void *workspace, cudaStream_t stream) override; +#if IS_TRT_VERSION_LT(8000) + int enqueue(int batchSize, const void* const* inputs, void** outputs, +#else + int enqueue(int batchSize, const void* const* inputs, void* const* outputs, +#endif + void* workspace, cudaStream_t stream) override; +}; + +class LayerNormPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const override { return "layernorm_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + return new LayerNormPlugin(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(LayerNormPluginCreator); + +class LayerNormPluginDynamic : public DynamicPluginTensorRT { + public: + LayerNormPluginDynamic(const float* bias, const int bias_num, + const float* scale, const int scale_num, + int begin_norm_axis, float eps, + std::vector mean_shape, + std::vector variance_shape) + : begin_norm_axis_(begin_norm_axis), + eps_(eps), + mean_shape_(mean_shape), + variance_shape_(variance_shape) { + bias_.resize(bias_num); + scale_.resize(scale_num); + std::copy(bias, bias + bias_num, bias_.data()); + std::copy(scale, scale + scale_num, scale_.data()); + } + + LayerNormPluginDynamic(void const* serialData, size_t serialLength) { + 
DeserializeValue(&serialData, &serialLength, &bias_); + DeserializeValue(&serialData, &serialLength, &scale_); + DeserializeValue(&serialData, &serialLength, &begin_norm_axis_); + DeserializeValue(&serialData, &serialLength, &eps_); + DeserializeValue(&serialData, &serialLength, &mean_shape_); + DeserializeValue(&serialData, &serialLength, &variance_shape_); + } + nvinfer1::IPluginV2DynamicExt* clone() const override { + return new LayerNormPluginDynamic(bias_.data(), bias_.size(), scale_.data(), + scale_.size(), begin_norm_axis_, eps_, + mean_shape_, variance_shape_); + } + + const char* getPluginType() const override { + return "layernorm_plugin_dynamic"; + } + int getNbOutputs() const override { return 1; } + int initialize() override { return 0; } + + size_t getSerializationSize() const override { + return SerializedSize(bias_) + SerializedSize(scale_) + + SerializedSize(begin_norm_axis_) + SerializedSize(eps_) + + SerializedSize(mean_shape_) + SerializedSize(variance_shape_); + } + + void serialize(void* buffer) const override { + SerializeValue(&buffer, bias_); + SerializeValue(&buffer, scale_); + SerializeValue(&buffer, begin_norm_axis_); + SerializeValue(&buffer, eps_); + SerializeValue(&buffer, mean_shape_); + SerializeValue(&buffer, variance_shape_); + } + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override {} + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override { + return 0; + } + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + void destroy() override { delete this; } + + private: + std::vector bias_; + std::vector scale_; + framework::Tensor scale_t; + framework::Tensor bias_t; + framework::Tensor mean_t; + framework::Tensor variance_t; + int begin_norm_axis_; + float eps_; + std::vector mean_shape_; + std::vector variance_shape_; }; +class LayerNormPluginDynamicCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const override { + return "layernorm_plugin_dynamic"; + } + + const char* getPluginVersion() const override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + return new LayerNormPluginDynamic(serial_data, serial_length); + } +}; + +REGISTER_TRT_PLUGIN_V2(LayerNormPluginDynamicCreator); + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 154f61a2b7cd3f066cc1a671f8277232fde65a9d..7e1d18227e232588197ff405b4e86032ff9586d6 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -13,7 +13,6 @@ // limitations under the License. 
#include "paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/operators/math/pooling.h" namespace paddle { @@ -21,11 +20,6 @@ namespace inference { namespace tensorrt { namespace plugin { -PoolPlugin *CreatePoolPluginDeserialize(const void *buffer, size_t length) { - return new PoolPlugin(buffer, length); -} -REGISTER_TRT_PLUGIN("pool_plugin", CreatePoolPluginDeserialize); - nvinfer1::Dims PoolPlugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, int nbInputs) { @@ -42,7 +36,12 @@ nvinfer1::Dims PoolPlugin::getOutputDimensions(int index, } int PoolPlugin::enqueue(int batchSize, const void *const *inputs, +#if IS_TRT_VERSION_LT(8000) void **outputs, void *workspace, cudaStream_t stream) { +#else + void *const *outputs, void *workspace, + cudaStream_t stream) { +#endif auto const &input_dims = this->getInputDims(0); int input_size = 0; float const *idata = reinterpret_cast(inputs[0]); @@ -75,9 +74,35 @@ int PoolPlugin::enqueue(int batchSize, const void *const *inputs, // Dynamic Plugin below. #if IS_TRT_VERSION_GE(6000) -size_t PoolPluginDynamic::getSerializationSize() const { return 0; } +PoolPluginDynamic::PoolPluginDynamic(void const *serialData, + size_t serialLength) { + DeserializeValue(&serialData, &serialLength, &ceil_mode_); + const char *pool_type; + DeserializeValue(&serialData, &serialLength, &pool_type); + pool_type_ = std::string(pool_type); + DeserializeValue(&serialData, &serialLength, &adaptive_); + DeserializeValue(&serialData, &serialLength, &ksize_); + DeserializeValue(&serialData, &serialLength, &strides_); + DeserializeValue(&serialData, &serialLength, &paddings_); + DeserializeValue(&serialData, &serialLength, &is_global_); +} -void PoolPluginDynamic::serialize(void *buffer) const {} +size_t PoolPluginDynamic::getSerializationSize() const { + return SerializedSize(ceil_mode_) + SerializedSize(pool_type_.c_str()) + + SerializedSize(adaptive_) + SerializedSize(ksize_) + + SerializedSize(strides_) + SerializedSize(paddings_) + + SerializedSize(is_global_); +} + +void PoolPluginDynamic::serialize(void *buffer) const { + SerializeValue(&buffer, ceil_mode_); + SerializeValue(&buffer, pool_type_.c_str()); + SerializeValue(&buffer, adaptive_); + SerializeValue(&buffer, ksize_); + SerializeValue(&buffer, strides_); + SerializeValue(&buffer, paddings_); + SerializeValue(&buffer, is_global_); +} nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, @@ -169,7 +194,7 @@ bool PoolPluginDynamic::supportsFormatCombination( (in_out && pos < (nb_inputs + nb_outputs)); return ((in_out[pos].type == nvinfer1::DataType::kFLOAT) && - in_out[pos].format == nvinfer1::PluginFormat::kNCHW); + in_out[pos].format == nvinfer1::PluginFormat::kLINEAR); } nvinfer1::DataType PoolPluginDynamic::getOutputDataType( diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h index 6693a1fae4d4304af2f826894b119383ea704727..7c12796805c5d1a87f9a798d1a353be76f4a6e53 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h @@ -56,19 +56,18 @@ static std::vector CalcOutputSize(const std::vector& input_shape, } class PoolPlugin : public PluginTensorRT { - protected: - size_t getSerializationSize() override { - return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) + + 
public: + size_t getSerializationSize() const override { + return getBaseSerializationSize() + SerializedSize(ceil_mode_) + SerializedSize(pool_type_) + SerializedSize(adaptive_) + SerializedSize(ksize_) + SerializedSize(strides_) + SerializedSize(paddings_) + SerializedSize(input_shape_) + - SerializedSize(output_shape_) + getBaseSerializationSize(); + SerializedSize(output_shape_); } // TRT will call this func when we need to serialize the configuration of // tensorrt. - void serialize(void* buffer) override { - SerializeValue(&buffer, getPluginType()); + void serialize(void* buffer) const override { serializeBase(buffer); SerializeValue(&buffer, ceil_mode_); SerializeValue(&buffer, pool_type_); @@ -80,7 +79,6 @@ class PoolPlugin : public PluginTensorRT { SerializeValue(&buffer, output_shape_); } - public: enum class PoolType { max = 0, avg, @@ -128,7 +126,11 @@ class PoolPlugin : public PluginTensorRT { nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override; int initialize() override { return 0; } +#if IS_TRT_VERSION_LT(8000) int enqueue(int batchSize, const void* const* inputs, void** outputs, +#else + int enqueue(int batchSize, const void* const* inputs, void* const* outputs, +#endif void* workspace, cudaStream_t stream) override; private: @@ -142,6 +144,20 @@ class PoolPlugin : public PluginTensorRT { std::vector output_shape_; }; +class PoolPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const override { return "pool_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + return new PoolPlugin(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(PoolPluginCreator); + #if IS_TRT_VERSION_GE(6000) class PoolPluginDynamic : public DynamicPluginTensorRT { public: @@ -158,25 +174,14 @@ class PoolPluginDynamic : public DynamicPluginTensorRT { paddings_(paddings), is_global_(is_global) {} - PoolPluginDynamic(void const* serialData, size_t serialLength) { - deserializeBase(serialData, serialLength); - DeserializeValue(&serialData, &serialLength, &ceil_mode_); - const char* pool_type; - DeserializeValue(&serialData, &serialLength, &pool_type); - pool_type_ = std::string(pool_type); - DeserializeValue(&serialData, &serialLength, &adaptive_); - DeserializeValue(&serialData, &serialLength, &ksize_); - DeserializeValue(&serialData, &serialLength, &strides_); - DeserializeValue(&serialData, &serialLength, &paddings_); - DeserializeValue(&serialData, &serialLength, &is_global_); - } + PoolPluginDynamic(void const* serialData, size_t serialLength); ~PoolPluginDynamic() {} nvinfer1::IPluginV2DynamicExt* clone() const override { return new PoolPluginDynamic(ceil_mode_, pool_type_, adaptive_, ksize_, strides_, paddings_, is_global_); } - const char* getPluginType() const override { return "pool_plugin"; } + const char* getPluginType() const override { return "pool_plugin_dynamic"; } int getNbOutputs() const override { return 1; } int initialize() override { return 0; } @@ -222,6 +227,20 @@ class PoolPluginDynamic : public DynamicPluginTensorRT { std::vector paddings_; bool is_global_; }; + +class PoolPluginDynamicCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const override { return "pool_plugin_dynamic"; } + + const char* getPluginVersion() const override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + 
const void* serial_data, + size_t serial_length) override { + return new PoolPluginDynamic(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(PoolPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu index 00182b87e984fc3c43f46a3fcb2b9d828db4b170..1882084a8f51699cddaae192365d64cea0c0d41d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -19,7 +19,6 @@ #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/operators/math/prelu.h" namespace paddle { @@ -27,11 +26,6 @@ namespace inference { namespace tensorrt { namespace plugin { -PReluPlugin *CreatePreluPluginDeserialize(const void *buffer, size_t length) { - return new PReluPlugin(buffer, length); -} -REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize); - int PReluPlugin::initialize() { cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size()); cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float), @@ -57,7 +51,12 @@ nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, } int PReluPlugin::enqueue(int batch_size, const void *const *inputs, +#if IS_TRT_VERSION_LT(8000) void **outputs, void *workspace, cudaStream_t stream) { +#else + void *const *outputs, void *workspace, + cudaStream_t stream) { +#endif // input dims is CHW. const auto &input_dims = this->getInputDims(0); const float *input = reinterpret_cast(inputs[0]); @@ -99,9 +98,23 @@ int PReluPluginDynamic::initialize() { cudaMemcpyHostToDevice); return 0; } -size_t PReluPluginDynamic::getSerializationSize() const { return 0; } -void PReluPluginDynamic::serialize(void *buffer) const {} +PReluPluginDynamic::PReluPluginDynamic(void const *serialData, + size_t serialLength) { + DeserializeValue(&serialData, &serialLength, &weight_); + const char *prelu_mode; + DeserializeValue(&serialData, &serialLength, &prelu_mode); + mode_ = std::string(prelu_mode); +} + +size_t PReluPluginDynamic::getSerializationSize() const { + return SerializedSize(mode_.c_str()) + SerializedSize(weight_); +} + +void PReluPluginDynamic::serialize(void *buffer) const { + SerializeValue(&buffer, weight_); + SerializeValue(&buffer, mode_.c_str()); +} nvinfer1::DimsExprs PReluPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, @@ -124,7 +137,7 @@ bool PReluPluginDynamic::supportsFormatCombination( (in_out && pos < (nb_inputs + nb_outputs)); return ((in_out[pos].type == nvinfer1::DataType::kFLOAT) && - in_out[pos].format == nvinfer1::PluginFormat::kNCHW); + in_out[pos].format == nvinfer1::PluginFormat::kLINEAR); } nvinfer1::DataType PReluPluginDynamic::getOutputDataType( diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h index a0a24e70a01ef47fa71d9d79f7cc2554a60683d0..e3f05bdbe85a1b84ee7e230a3191ccf235466b34 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h @@ -33,23 +33,21 @@ class PReluPlugin : public PluginTensorRT { float* p_gpu_weight_; std::string mode_; - protected: - size_t getSerializationSize() override { + public: + size_t getSerializationSize() const override { return getBaseSerializationSize() + 
SerializedSize(mode_.c_str()) + - SerializedSize(weight_) + SerializedSize(getPluginType()); + SerializedSize(weight_); } // TRT will call this func when we need to serialize the configuration of // tensorrt. // It should not be called by users. - void serialize(void* buffer) override { - SerializeValue(&buffer, getPluginType()); + void serialize(void* buffer) const override { serializeBase(buffer); SerializeValue(&buffer, weight_); SerializeValue(&buffer, mode_.c_str()); } - public: PReluPlugin(const float* weight, const int weight_num, std::string const& mode) : mode_(mode) { @@ -80,10 +78,28 @@ class PReluPlugin : public PluginTensorRT { int getNbOutputs() const override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override; +#if IS_TRT_VERSION_LT(8000) int enqueue(int batchSize, const void* const* inputs, void** outputs, +#else + int enqueue(int batchSize, const void* const* inputs, void* const* outputs, +#endif void* workspace, cudaStream_t stream) override; }; +class PReluPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const override { return "prelu_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + return new PReluPlugin(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(PReluPluginCreator); + #if IS_TRT_VERSION_GE(6000) class PReluPluginDynamic : public DynamicPluginTensorRT { public: @@ -94,15 +110,7 @@ class PReluPluginDynamic : public DynamicPluginTensorRT { std::copy(weight, weight + weight_num, weight_.data()); } - // It was used for tensorrt deserialization. - // It should not be called by users. 
- PReluPluginDynamic(void const* serialData, size_t serialLength) { - deserializeBase(serialData, serialLength); - DeserializeValue(&serialData, &serialLength, &weight_); - const char* prelu_mode; - DeserializeValue(&serialData, &serialLength, &prelu_mode); - mode_ = std::string(prelu_mode); - } + PReluPluginDynamic(void const* serialData, size_t serialLength); ~PReluPluginDynamic() {} nvinfer1::IPluginV2DynamicExt* clone() const override { auto ptr = new PReluPluginDynamic(weight_.data(), weight_.size(), mode_); @@ -110,7 +118,7 @@ class PReluPluginDynamic : public DynamicPluginTensorRT { return ptr; } - const char* getPluginType() const override { return "prelu_plugin"; } + const char* getPluginType() const override { return "prelu_plugin_dynamic"; } int getNbOutputs() const override { return 1; } int initialize() override; void terminate() override; @@ -155,6 +163,20 @@ class PReluPluginDynamic : public DynamicPluginTensorRT { }; #endif +class PReluPluginDynamicCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const override { return "prelu_plugin_dynamic"; } + + const char* getPluginVersion() const override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + return new PReluPluginDynamic(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(PReluPluginDynamicCreator); + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index a5fc9e73c5f27f1280171966df853675e2f0d73b..0d9e5417263f3b299d13d25f16fd8a446447f051 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -20,7 +20,6 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" #include "paddle/fluid/operators/math/blas.h" @@ -225,6 +224,14 @@ nvinfer1::DataType QkvToContextPluginDynamic::getOutputDataType( return input_types[0]; } +template +__global__ void apply_scale(T *data, T scale, int n) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + data[tid] = data[tid] * scale; +#endif +} + int QkvToContextPluginDynamic::enqueue( const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, @@ -291,10 +298,17 @@ int QkvToContextPluginDynamic::enqueue( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(device_id))); + int n_q = seq_len * head_number_ * head_size_ * batch; + constexpr int threads = 128; + int blocks = (n_q + threads - 1) / threads; + + apply_scale<<>>(tptr, static_cast(scale_), + n_q); + const platform::CUDADeviceContext &dev_ctx = *device_ctx; operators::math::MultiHeadGPUComputeFunctor multihead_compute_func; multihead_compute_func(dev_ctx, batch, seq_len, head_number_, head_size_, - qkptr, input1_data, tptr, half(scale_), half(0.0)); + qkptr, input1_data, tptr, half(1.), half(0.0)); int grid = batch * head_number_ * seq_len; int block = head_size_; diff --git 
a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu index 6e7ed0054f502ea014d3648ac0be22c167987735..5ec6e5af86daf19c4d79eb18d72a89d1f71f8393 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -17,7 +17,6 @@ #include #include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { @@ -304,7 +303,7 @@ int RoiAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, nvinfer1::DataType RoiAlignPluginDynamic::getOutputDataType( int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { - return data_type_; + return inputTypes[0]; } const char* RoiAlignPluginDynamic::getPluginType() const { diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu index 7be9e3a740ab1c3532f5a67f06048c6c745eb214..346b4c680830e92a9d78fdaa6c124aac13755c3b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu @@ -19,7 +19,6 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" namespace paddle { diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu index b44b3face92e14fc49732621d5397a6fdcf859a2..70ff0e7cb069d7f64784b2e3065327ea2b294d10 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu @@ -19,18 +19,12 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { -SlicePlugin *CreateSlicePluginDeserialize(const void *buffer, size_t length) { - return new SlicePlugin(buffer, length); -} -REGISTER_TRT_PLUGIN("slice_plugin", CreateSlicePluginDeserialize); - template __global__ void SliceKernel(int num, int dims, const T *input, const int *offsets_info, T *output) { @@ -90,10 +84,10 @@ bool SlicePlugin::supportsFormat(nvinfer1::DataType type, if (with_fp16_) { return ((type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF) && - (format == nvinfer1::PluginFormat::kNCHW)); + (format == nvinfer1::PluginFormat::kLINEAR)); } else { return ((type == nvinfer1::DataType::kFLOAT) && - (format == nvinfer1::PluginFormat::kNCHW)); + (format == nvinfer1::PluginFormat::kLINEAR)); } } @@ -111,7 +105,12 @@ nvinfer1::Dims SlicePlugin::getOutputDimensions(int index, } int SlicePlugin::enqueue(int batch_size, const void *const *inputs, +#if IS_TRT_VERSION_LT(8000) void **outputs, void *workspace, cudaStream_t stream) { +#else + void *const *outputs, void *workspace, + cudaStream_t stream) { +#endif auto input_dims = getInputDims(0); // notice input dims is [C, H, W], add input batch dim here @@ -188,13 +187,13 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs, return cudaGetLastError() != cudaSuccess; } -size_t SlicePlugin::getSerializationSize() { +size_t 
SlicePlugin::getSerializationSize() const { return getBaseSerializationSize() + SerializedSize(getPluginType()) + SerializedSize(starts_) + SerializedSize(ends_) + SerializedSize(axes_); } -void SlicePlugin::serialize(void *buffer) { +void SlicePlugin::serialize(void *buffer) const { SerializeValue(&buffer, getPluginType()); serializeBase(buffer); SerializeValue(&buffer, starts_); diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index 9d4f9a35c3b6fe02981853eb3c0a697d5cb3a199..b656918f8fbab460c6e029c8d95d97eca250c96a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -44,15 +44,18 @@ class SlicePlugin : public PluginTensorRT { nvinfer1::PluginFormat format) const override; nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nb_input_dims) override; +#if IS_TRT_VERSION_LT(8000) int enqueue(int batch_size, const void* const* inputs, void** outputs, +#else + int enqueue(int batch_size, const void* const* inputs, void* const* outputs, +#endif void* workspace, cudaStream_t stream) override; - protected: - size_t getSerializationSize() override; + size_t getSerializationSize() const override; // TRT will call this func to serialize the configuration of TRT // It should not be called by users. - void serialize(void* buffer) override; + void serialize(void* buffer) const override; private: std::vector starts_; @@ -63,6 +66,20 @@ class SlicePlugin : public PluginTensorRT { cudaStream_t copy_stream_; }; +class SlicePluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const override { return "slice_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + return new SlicePlugin(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(SlicePluginCreator); + #if IS_TRT_VERSION_GE(6000) class SlicePluginDynamic : public DynamicPluginTensorRT { public: @@ -75,7 +92,7 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { SlicePluginDynamic(void const* serialData, size_t serialLength); - const char* getPluginType() const override { return "slice_plugin"; } + const char* getPluginType() const override { return "slice_plugin_dynamic"; } int getNbOutputs() const override { return 1; } int initialize() override; @@ -121,40 +138,18 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { cudaStream_t copy_stream_; }; -class SlicePluginDynamicCreator : public nvinfer1::IPluginCreator { +class SlicePluginDynamicCreator : public TensorRTPluginCreator { public: - SlicePluginDynamicCreator() {} - const char* getPluginName() const override { return "slice_plugin"; } + const char* getPluginName() const override { return "slice_plugin_dynamic"; } const char* getPluginVersion() const override { return "1"; } - const nvinfer1::PluginFieldCollection* getFieldNames() override { - return &field_collection_; - } - - nvinfer1::IPluginV2* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override { - return nullptr; - } - nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serialData, size_t serialLength) override { - auto plugin = new SlicePluginDynamic(serialData, serialLength); - return plugin; + return new SlicePluginDynamic(serialData, serialLength); } - - void setPluginNamespace(const char* 
libNamespace) override { - namespace_ = libNamespace; - } - - const char* getPluginNamespace() const override { return namespace_.c_str(); } - - private: - std::string namespace_; - nvinfer1::PluginFieldCollection field_collection_; }; - REGISTER_TRT_PLUGIN_V2(SlicePluginDynamicCreator); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu index fdb14f9ceaf29fe90cd756b77e7c5afff2296f44..3bef9672e5058ad7210beac47fbd83be7c4f6065 100644 --- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu @@ -16,7 +16,6 @@ #include #include #include "paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 1b5c39f8fff855fac4ef8f2ee54faa872023ad05..37afff9105d80a331ef7f0a335e0f07d683e93e5 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -15,7 +15,6 @@ #include #include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { @@ -126,7 +125,12 @@ __global__ void split_kernel(int nsegment, } int SplitPlugin::enqueue(int batchSize, const void* const* inputs, +#if IS_TRT_VERSION_LT(8000) void** outputs, void* workspace, cudaStream_t stream) { +#else + void* const* outputs, void* workspace, + cudaStream_t stream) { +#endif const int* d_segment_offsets_ptr = thrust::raw_pointer_cast(&d_segment_offsets_[0]); float const* input_ptr = reinterpret_cast(inputs[0]); diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 1ee895154d6b046c6c18c2e374d3c63f1fcc5d62..a791395f4a3d3824e4c54ed2cfaf97b79859fde4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -60,7 +60,11 @@ class SplitPlugin : public PluginTensorRTV2Ext { int initialize() override; void terminate() override; +#if IS_TRT_VERSION_LT(8000) int enqueue(int batch_size, const void* const* inputs, void** outputs, +#else + int enqueue(int batch_size, const void* const* inputs, void* const* outputs, +#endif void* workspace, cudaStream_t stream) override; void destroy() override { delete this; } diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu index 79ec2066faa130e191ab34f58a030b607172c218..21e80339b500628edebc3964cfc397d0984442a6 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu @@ -16,7 +16,6 @@ #include #include #include "paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu index 3847d999446e99dfe0bcdc7abfa06ac6c57e64e2..da9d21acd5d63f30fe9a3ac6e0ec7d37dfe4c03d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu +++ 
b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu @@ -17,18 +17,12 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { -SwishPlugin *CreateSwishPluginDeserialize(const void *buffer, size_t length) { - return new SwishPlugin(buffer, length); -} -REGISTER_TRT_PLUGIN("swish_plugin", CreateSwishPluginDeserialize); - int SwishPlugin::initialize() { return 0; } nvinfer1::Dims SwishPlugin::getOutputDimensions(int index, @@ -85,7 +79,12 @@ __global__ void swish_kernel(int num, const half *input, half *output, } int SwishPlugin::enqueue(int batch_size, const void *const *inputs, +#if IS_TRT_VERSION_LT(8000) void **outputs, void *workspace, cudaStream_t stream) { +#else + void *const *outputs, void *workspace, + cudaStream_t stream) { +#endif // input dims is CHW. const auto &input_dims = this->getInputDims(0); const float *input = reinterpret_cast(inputs[0]); diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h index 11579aadcc45731123770352ef08b362ff3ef745..8940fdce3b0b56fe6a02478841adb6bcaa79cf83 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h @@ -30,22 +30,16 @@ class SwishPlugin : public PluginTensorRT { private: float beta_; - protected: - size_t getSerializationSize() override { - return SerializedSize(getPluginType()) + getBaseSerializationSize() + - SerializedSize(beta_); + public: + size_t getSerializationSize() const override { + return getBaseSerializationSize() + SerializedSize(beta_); } - // TRT will call this func when we need to serialize the configuration of - // tensorrt. - // It should not be called by users. 
- void serialize(void* buffer) override { - SerializeValue(&buffer, getPluginType()); + void serialize(void* buffer) const override { serializeBase(buffer); SerializeValue(&buffer, beta_); } - public: explicit SwishPlugin(const float beta, const bool with_fp16) : beta_(beta) { with_fp16_ = with_fp16; } @@ -56,7 +50,9 @@ class SwishPlugin : public PluginTensorRT { deserializeBase(serialData, serialLength); DeserializeValue(&serialData, &serialLength, &beta_); } + ~SwishPlugin() {} + int initialize() override; SwishPlugin* clone() const override { @@ -67,10 +63,28 @@ class SwishPlugin : public PluginTensorRT { int getNbOutputs() const override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) override; +#if IS_TRT_VERSION_LT(8000) int enqueue(int batchSize, const void* const* inputs, void** outputs, +#else + int enqueue(int batchSize, const void* const* inputs, void* const* outputs, +#endif void* workspace, cudaStream_t stream) override; }; +class SwishPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const override { return "swish_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + return new SwishPlugin(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(SwishPluginCreator); + #if IS_TRT_VERSION_GE(6000) class SwishPluginDynamic : public DynamicPluginTensorRT { public: @@ -86,7 +100,7 @@ class SwishPluginDynamic : public DynamicPluginTensorRT { return new SwishPluginDynamic(beta_, with_fp16_); } - const char* getPluginType() const override { return "swish_plugin"; } + const char* getPluginType() const override { return "swish_plugin_dynamic"; } int getNbOutputs() const override { return 1; } int initialize() override; @@ -127,44 +141,18 @@ class SwishPluginDynamic : public DynamicPluginTensorRT { float beta_; }; -class SwishPluginDynamicCreator : public nvinfer1::IPluginCreator { +class SwishPluginDynamicCreator : public TensorRTPluginCreator { public: - SwishPluginDynamicCreator() {} - const char* getPluginName() const override { return "swish_plugin"; } + const char* getPluginName() const override { return "swish_plugin_dynamic"; } const char* getPluginVersion() const override { return "1"; } - const nvinfer1::PluginFieldCollection* getFieldNames() override { - return &field_collection_; - } - - nvinfer1::IPluginV2* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override { - return nullptr; - } - nvinfer1::IPluginV2* deserializePlugin(const char* name, const void* serial_data, size_t serial_length) override { - auto plugin = new SwishPluginDynamic(serial_data, serial_length); - return plugin; + return new SwishPluginDynamic(serial_data, serial_length); } - - void setPluginNamespace(const char* lib_namespace) override { - plugin_namespace_ = lib_namespace; - } - - const char* getPluginNamespace() const override { - return plugin_namespace_.c_str(); - } - - private: - std::string plugin_namespace_; - std::string plugin_name_; - nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; - std::vector plugin_attributes_; }; - REGISTER_TRT_PLUGIN_V2(SwishPluginDynamicCreator); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc index 6636513a555f9e638e1dfdb54986010c76785e2a..46f585e6557460c850b6419049b4dbf31d592509 100644 
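Note on the creator classes added above: every static-shape plugin touched by this change (slice, swish, and the others below) now ships a small creator derived from the new TensorRTPluginCreator base instead of registering a free deserialization function through the removed REGISTER_TRT_PLUGIN macro. A minimal sketch of the pattern follows; "FooPlugin" is a hypothetical plugin used only for illustration, while the base class and the REGISTER_TRT_PLUGIN_V2 macro are the ones introduced in trt_plugin.h in this diff.

// FooPlugin is a stand-in name. Only getPluginName/getPluginVersion/
// deserializePlugin need to be overridden; the field-collection and
// namespace plumbing lives in the TensorRTPluginCreator base class.
class FooPluginCreator : public TensorRTPluginCreator {
 public:
  const char* getPluginName() const override { return "foo_plugin"; }

  const char* getPluginVersion() const override { return "1"; }

  nvinfer1::IPluginV2* deserializePlugin(const char* name,
                                         const void* serial_data,
                                         size_t serial_length) override {
    // Assumes FooPlugin keeps the usual (serialized data, length) constructor.
    return new FooPlugin(serial_data, serial_length);
  }
};
REGISTER_TRT_PLUGIN_V2(FooPluginCreator);

Because each creator is keyed by its plugin name string, the dynamic variants are renamed to "slice_plugin_dynamic" and "swish_plugin_dynamic" in this change so they no longer collide with the static-shape creators.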
--- a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc @@ -33,7 +33,7 @@ TEST(split_op_plugin, test_plugin) { input_dims.push_back(in_dims); sp_plugin.configurePlugin(input_dims.data(), 1, nullptr, 2, input_types.data(), nullptr, nullptr, nullptr, - nvinfer1::PluginFormat::kNCHW, 4); + nvinfer1::PluginFormat::kLINEAR, 4); sp_plugin.initialize(); sp_plugin.getPluginType(); sp_plugin.canBroadcastInputAcrossBatch(0); diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc index 55bc786746beafcf7b2df98d54e9391e6a59ba24..5be0ed4a13b2309ffc15135176c5962a70d4793a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -21,10 +21,9 @@ namespace plugin { inline void Seria(void*& buffer, // NOLINT const std::vector& input_dims, - size_t max_batch_size, nvinfer1::DataType data_type, + nvinfer1::DataType data_type, nvinfer1::PluginFormat data_format, bool with_fp16) { SerializeValue(&buffer, input_dims); - SerializeValue(&buffer, max_batch_size); SerializeValue(&buffer, data_type); SerializeValue(&buffer, data_format); SerializeValue(&buffer, with_fp16); @@ -32,43 +31,39 @@ inline void Seria(void*& buffer, // NOLINT inline void Deseria(void const*& serial_data, size_t& serial_length, // NOLINT std::vector* input_dims, - size_t* max_batch_size, nvinfer1::DataType* data_type, + nvinfer1::DataType* data_type, nvinfer1::PluginFormat* data_format, bool* with_fp16) { DeserializeValue(&serial_data, &serial_length, input_dims); - DeserializeValue(&serial_data, &serial_length, max_batch_size); DeserializeValue(&serial_data, &serial_length, data_type); DeserializeValue(&serial_data, &serial_length, data_format); DeserializeValue(&serial_data, &serial_length, with_fp16); } inline size_t SeriaSize(const std::vector& input_dims, - size_t max_batch_size, nvinfer1::DataType data_type, + nvinfer1::DataType data_type, nvinfer1::PluginFormat data_format, bool with_fp16) { - return (SerializedSize(input_dims) + SerializedSize(max_batch_size) + - SerializedSize(data_type) + SerializedSize(data_format) + - SerializedSize(with_fp16)); + return (SerializedSize(input_dims) + SerializedSize(data_type) + + SerializedSize(data_format) + SerializedSize(with_fp16)); } -void PluginTensorRT::serializeBase(void*& buffer) { - Seria(buffer, input_dims_, max_batch_size_, data_type_, data_format_, - with_fp16_); +void PluginTensorRT::serializeBase(void*& buffer) const { + Seria(buffer, input_dims_, data_type_, data_format_, with_fp16_); } void PluginTensorRT::deserializeBase(void const*& serial_data, size_t& serial_length) { - Deseria(serial_data, serial_length, &input_dims_, &max_batch_size_, - &data_type_, &data_format_, &with_fp16_); + Deseria(serial_data, serial_length, &input_dims_, &data_type_, &data_format_, + &with_fp16_); } -size_t PluginTensorRT::getBaseSerializationSize() { - return SeriaSize(input_dims_, max_batch_size_, data_type_, data_format_, - with_fp16_); +size_t PluginTensorRT::getBaseSerializationSize() const { + return SeriaSize(input_dims_, data_type_, data_format_, with_fp16_); } bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const { return ((type == nvinfer1::DataType::kFLOAT) && - (format == nvinfer1::PluginFormat::kNCHW)); + (format == nvinfer1::PluginFormat::kLINEAR)); } void PluginTensorRT::configureWithFormat( @@ -78,23 +73,20 @@ void 
PluginTensorRT::configureWithFormat( data_type_ = type; data_format_ = format; input_dims_.assign(input_dims, input_dims + num_inputs); - max_batch_size_ = max_batch_size; } void PluginTensorRTV2Ext::serializeBase(void*& buffer) const { - Seria(buffer, input_dims_, max_batch_size_, data_type_, data_format_, - with_fp16_); + Seria(buffer, input_dims_, data_type_, data_format_, with_fp16_); } void PluginTensorRTV2Ext::deserializeBase(void const*& serial_data, size_t& serial_length) { - Deseria(serial_data, serial_length, &input_dims_, &max_batch_size_, - &data_type_, &data_format_, &with_fp16_); + Deseria(serial_data, serial_length, &input_dims_, &data_type_, &data_format_, + &with_fp16_); } size_t PluginTensorRTV2Ext::getBaseSerializationSize() const { - return SeriaSize(input_dims_, max_batch_size_, data_type_, data_format_, - with_fp16_); + return SeriaSize(input_dims_, data_type_, data_format_, with_fp16_); } void PluginTensorRTV2Ext::configurePlugin( @@ -105,11 +97,27 @@ void PluginTensorRTV2Ext::configurePlugin( const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, int32_t max_batch_size) { input_dims_.assign(input_dims, input_dims + nb_inputs); - max_batch_size_ = max_batch_size; data_format_ = float_format; data_type_ = input_types[0]; } +const nvinfer1::PluginFieldCollection* TensorRTPluginCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2* TensorRTPluginCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + return nullptr; +} + +void TensorRTPluginCreator::setPluginNamespace(const char* lib_namespace) { + plugin_namespace_ = lib_namespace; +} + +const char* TensorRTPluginCreator::getPluginNamespace() const { + return plugin_namespace_.c_str(); +} + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index ce3133ae99e94c62c0c8e958065700373d270037..599294392799dcd44dbad7ab4c9b7d9753dc2684 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -45,79 +45,98 @@ typedef std::function typedef std::function PluginConstructFunc; // Deprecated. Do not inherit this class, please refer to PluginTensorRTV2Ext -class PluginTensorRT : public nvinfer1::IPluginExt { +class PluginTensorRT : public nvinfer1::IPluginV2 { public: PluginTensorRT() : with_fp16_(false) {} + // It was used for TensorRT deserialization. // It should not be called by users. 
PluginTensorRT(const void* serialized_data, size_t length) {} + virtual ~PluginTensorRT() {} nvinfer1::Dims const& getInputDims(int index) const { return input_dims_.at(index); } - size_t getMaxBatchSize() const { return max_batch_size_; } + nvinfer1::DataType getDataType() const { return data_type_; } - nvinfer1::PluginFormat getDataFormat() const { return data_format_; } - virtual const char* getPluginVersion() const { return "1"; } - void AddInput(nvinfer1::ITensor* input) { inputs_.push_back(input); } - std::vector& GetInputs() { return inputs_; } + nvinfer1::PluginFormat getDataFormat() const { return data_format_; } - virtual nvinfer1::IPluginExt* clone() const = 0; + // IPluginV2 virtual const char* getPluginType() const = 0; - // Following functions are inherit from nvinfer1::IPluginExt - // Get the number of outputs from the layer + virtual const char* getPluginVersion() const { return "1"; } + int getNbOutputs() const { return 1; } - // Get the dimension of an output tensor + virtual nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* input_dims, int num_inputs) = 0; - // Find the workspace size required by the layer - size_t getWorkspaceSize(int) const override { return 0; } + + // Check format support. The default is FLOAT32 and kLINEAR. + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const override; + + // Configure the layer + void configureWithFormat(const nvinfer1::Dims* input_dims, int num_inputs, + const nvinfer1::Dims* output_dims, int num_outputs, + nvinfer1::DataType type, + nvinfer1::PluginFormat format, + int max_batch_size) override; // Initialize the layer for execution. - // This is called when the engine is created. int initialize() override { return 0; } + // Shutdown the layer. This is called when the engine is destroyed void terminate() override {} - // Execute the layer + + // Find the workspace size required by the layer + size_t getWorkspaceSize(int) const override { return 0; } + +// Execute the layer +#if IS_TRT_VERSION_LT(8000) virtual int enqueue(int batch_size, const void* const* inputs, void** outputs, +#else + virtual int enqueue(int batch_size, const void* const* inputs, + void* const* outputs, +#endif void* workspace, cudaStream_t stream) = 0; // Find the size of the serialization buffer required - virtual size_t getSerializationSize() = 0; + virtual size_t getSerializationSize() const = 0; + // Serialize the layer config to buffer. // TensorRT will call this func to serialize the configuration of TensorRT // engine. It should not be called by users. - virtual void serialize(void* buffer) = 0; + virtual void serialize(void* buffer) const = 0; - // Check format support. The default is FLOAT32 and NCHW. 
- bool supportsFormat(nvinfer1::DataType type, - nvinfer1::PluginFormat format) const override; - // Configure the layer - void configureWithFormat(const nvinfer1::Dims* input_dims, int num_inputs, - const nvinfer1::Dims* output_dims, int num_outputs, - nvinfer1::DataType type, - nvinfer1::PluginFormat format, - int max_batch_size) override; + void destroy() override { delete this; } + + virtual nvinfer1::IPluginV2* clone() const = 0; + + void setPluginNamespace(const char* plugin_namespace) override { + namespace_ = plugin_namespace; + } + + const char* getPluginNamespace() const override { return namespace_.c_str(); } protected: // Deserialize input_dims, max_batch_size, data_type, data_format void deserializeBase(void const*& serial_data, // NOLINT size_t& serial_length); // NOLINT - size_t getBaseSerializationSize(); + size_t getBaseSerializationSize() const; // Serialize input_dims, max_batch_size, data_type, data_format - void serializeBase(void*& buffer); // NOLINT + void serializeBase(void*& buffer) const; // NOLINT std::vector input_dims_; - size_t max_batch_size_; nvinfer1::DataType data_type_; nvinfer1::PluginFormat data_format_; - std::vector inputs_; bool with_fp16_; + + private: + std::string namespace_; }; // TensorRT introduced IPluginV2Ext after 5.1, Paddle no longer supports @@ -130,7 +149,6 @@ class PluginTensorRTV2Ext : public nvinfer1::IPluginV2Ext { nvinfer1::Dims const& getInputDims(int index) const { return input_dims_.at(index); } - size_t getMaxBatchSize() const { return max_batch_size_; } nvinfer1::DataType getDataType() const { return data_type_; } nvinfer1::PluginFormat getDataFormat() const { return data_format_; } @@ -176,7 +194,7 @@ class PluginTensorRTV2Ext : public nvinfer1::IPluginV2Ext { bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const override { return ((type == nvinfer1::DataType::kFLOAT) && - (format == nvinfer1::PluginFormat::kNCHW)); + (format == nvinfer1::PluginFormat::kLINEAR)); } // Initialize the layer for execution. // This is called when the engine is created. 
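The version-guarded enqueue declarations above are the compatibility core of this change: TensorRT 8.0 switched the outputs parameter of IPluginV2::enqueue from void** to void* const*, so the base classes and every concrete plugin repeat the same IS_TRT_VERSION_LT(8000) guard. A short sketch of what a derived plugin ends up with; "FooPlugin" is a hypothetical name used only to keep the example compact.

// Header: the declaration mirrors the guarded signature of the base class.
#if IS_TRT_VERSION_LT(8000)
  int enqueue(int batch_size, const void* const* inputs, void** outputs,
#else
  int enqueue(int batch_size, const void* const* inputs, void* const* outputs,
#endif
              void* workspace, cudaStream_t stream) override;

// Source: the definition carries the same guard; the body itself is unchanged.
#if IS_TRT_VERSION_LT(8000)
int FooPlugin::enqueue(int batch_size, const void* const* inputs,
                       void** outputs, void* workspace, cudaStream_t stream) {
#else
int FooPlugin::enqueue(int batch_size, const void* const* inputs,
                       void* const* outputs, void* workspace,
                       cudaStream_t stream) {
#endif
  // Kernels are launched on `stream`; returning the CUDA error status keeps
  // the convention used by the existing plugins.
  return cudaGetLastError() != cudaSuccess;
}

The const qualifiers added to getSerializationSize() and serialize() follow the same logic: nvinfer1::IPluginV2 declares both as const member functions, so every override in the touched headers gains the qualifier when the base switches from IPluginExt to IPluginV2.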
@@ -188,8 +206,13 @@ class PluginTensorRTV2Ext : public nvinfer1::IPluginV2Ext { // Find the workspace size required by the layer size_t getWorkspaceSize(int) const override { return 0; } - // Execute the layer +// Execute the layer +#if IS_TRT_VERSION_LT(8000) virtual int enqueue(int batch_size, const void* const* inputs, void** outputs, +#else + virtual int enqueue(int batch_size, const void* const* inputs, + void* const* outputs, +#endif void* workspace, cudaStream_t stream) = 0; // Find the size of the serialization buffer required @@ -218,10 +241,8 @@ class PluginTensorRTV2Ext : public nvinfer1::IPluginV2Ext { protected: std::vector input_dims_; - size_t max_batch_size_; nvinfer1::DataType data_type_; nvinfer1::PluginFormat data_format_; - std::vector inputs_; bool with_fp16_; private: @@ -295,6 +316,34 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { }; #endif +class TensorRTPluginCreator : public nvinfer1::IPluginCreator { + public: + TensorRTPluginCreator() = default; + + virtual const char* getPluginName() const = 0; + + virtual const char* getPluginVersion() const = 0; + + const nvinfer1::PluginFieldCollection* getFieldNames() override; + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + + virtual nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) = 0; + + void setPluginNamespace(const char* lib_namespace) override; + + const char* getPluginNamespace() const override; + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + template class TrtPluginRegistrarV2 { public: diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc deleted file mode 100644 index dd4e06ee2a900bb3285b463cd948b158845c506c..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace inference { -namespace tensorrt { -namespace plugin { - -PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name, - const void* serial_data, - size_t serial_length) { - const char* plugin_type; - DeserializeValue(&serial_data, &serial_length, &plugin_type); - - PADDLE_ENFORCE_EQ( - Has(plugin_type), true, - platform::errors::NotFound("TensorRT plugin type `%s` does not exists.", - plugin_type)); - auto plugin = plugin_registry_[plugin_type](serial_data, serial_length); - owned_plugins_.emplace_back(plugin); - - return plugin; -} - -bool PluginFactoryTensorRT::RegisterPlugin( - const std::string& op_name, PluginDeserializeFunc deserialize_func) { - if (Has(op_name)) return false; - auto ret = plugin_registry_.emplace(op_name, deserialize_func); - return ret.second; -} - -void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); } - -} // namespace plugin -} // namespace tensorrt -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h deleted file mode 100644 index 076dfbcf8f095ff15a265239c7b267db952b14be..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" -#include "paddle/fluid/inference/utils/singleton.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/variant.h" - -namespace paddle { -namespace inference { -namespace tensorrt { -namespace plugin { - -class PluginFactoryTensorRT : public nvinfer1::IPluginFactory, - public DeleteHelper { - public: - // Deserialization method - PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data, - size_t serial_length) override; - - bool RegisterPlugin(const std::string& op_name, - PluginDeserializeFunc deserialize_func); - - bool Has(const std::string& op_name) { - return plugin_registry_.find(op_name) != plugin_registry_.end(); - } - - void DestroyPlugins(); - - protected: - std::unordered_map plugin_registry_; - - std::list> owned_plugins_; -}; - -class TrtPluginRegistrar { - public: - TrtPluginRegistrar(const std::string& name, - PluginDeserializeFunc deserialize_func) { - inference::Singleton::Global().RegisterPlugin( - name, deserialize_func); - } -}; - -#define REGISTER_TRT_PLUGIN(name, deserialize_func) \ - REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func) - -#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func) \ - static paddle::inference::tensorrt::plugin::TrtPluginRegistrar \ - trt_plugin_registrar##ctr UNUSED = \ - paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \ - name, deserialize_func) - -} // namespace plugin -} // namespace tensorrt -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index 13d07e774036a48b0ed6e3c91b168eaab4461df5..fe292dba4673f68d7c55e1afb7a965ce77430125 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -17,7 +17,6 @@ #include #include -#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" #include "paddle/fluid/operators/detection/yolo_box_op.h" @@ -243,7 +242,11 @@ int YoloBoxPlugin::enqueue_impl(int batch_size, const void* const* inputs, } int YoloBoxPlugin::enqueue(int batch_size, const void* const* inputs, +#if IS_TRT_VERSION_LT(8000) void** outputs, void* workspace, +#else + void* const* outputs, void* workspace, +#endif cudaStream_t stream) { if (data_type_ == nvinfer1::DataType::kFLOAT) { return enqueue_impl(batch_size, inputs, outputs, workspace, stream); @@ -295,7 +298,7 @@ const char* YoloBoxPlugin::getPluginNamespace() const { nvinfer1::DataType YoloBoxPlugin::getOutputDataType( int index, const nvinfer1::DataType* input_type, int nb_inputs) const { - return data_type_; + return input_type[0]; } bool YoloBoxPlugin::isOutputBroadcastAcrossBatch(int output_index, diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h index 8ca21da7ae0377164cbb50c502f0abb5ca943058..4cd6a383336e236251b9cbef49c96b18a8fe0537 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h @@ -43,7 +43,11 @@ class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { bool supportsFormat(nvinfer1::DataType type, 
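With trt_plugin_factory.{h,cc} deleted, the REGISTER_TRT_PLUGIN / PluginFactoryTensorRT deserialization path is gone; the creators registered through REGISTER_TRT_PLUGIN_V2 are assumed to feed TensorRT's own plugin registry instead (the TrtPluginRegistrarV2 body is not shown in this diff, so this is an inference from the creator API). A rough sketch of the registry-based lookup the runtime performs, or that caller code can do by hand; the helper name is hypothetical.

#include "NvInfer.h"

// Resolve a plugin from TensorRT's global registry by the name/version pair
// its creator advertises (e.g. "slice_plugin", "1") and let the creator
// rebuild the plugin from its serialized bytes.
nvinfer1::IPluginV2* DeserializeFromRegistry(const char* plugin_name,
                                             const char* plugin_version,
                                             const void* serial_data,
                                             size_t serial_length) {
  auto* creator =
      getPluginRegistry()->getPluginCreator(plugin_name, plugin_version);
  if (creator == nullptr) {
    return nullptr;  // not registered; mirrors the old factory's Has() check
  }
  return creator->deserializePlugin(plugin_name, serial_data, serial_length);
}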
nvinfer1::TensorFormat format) const override; size_t getWorkspaceSize(int max_batch_size) const override; +#if IS_TRT_VERSION_LT(8000) int enqueue(int batch_size, const void* const* inputs, void** outputs, +#else + int enqueue(int batch_size, const void* const* inputs, void* const* outputs, +#endif void* workspace, cudaStream_t stream) override; template int enqueue_impl(int batch_size, const void* const* inputs, void** outputs, diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index 7c763858bb2101543af1dce0f3b81e964257a696..c627075bfe95d929c83f4b66836ccc9af1ca06d1 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -68,7 +68,7 @@ TEST_F(TensorRTEngineTest, add_layer) { TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size); auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, - nvinfer1::DimsCHW{1, 1, 1}); + nvinfer1::Dims3{1, 1, 1}); auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, weight.get(), bias.get()); PADDLE_ENFORCE_NOT_NULL(fc_layer, @@ -91,6 +91,15 @@ TEST_F(TensorRTEngineTest, add_layer) { buffers[0] = reinterpret_cast(x_v_gpu_data); buffers[1] = reinterpret_cast(y_gpu_data); + LOG(INFO) << "Set attr"; + engine_->Set("test_attr", new std::string("test_attr")); + if (engine_->Has("test_attr")) { + auto attr_val = engine_->Get("test_attr"); + engine_->Erase("test_attr"); + } + std::string *attr_key = new std::string("attr_key"); + engine_->SetNotOwned("attr1", attr_key); + LOG(INFO) << "to execute"; engine_->Execute(1, &buffers, ctx_->stream()); @@ -99,6 +108,8 @@ TEST_F(TensorRTEngineTest, add_layer) { LOG(INFO) << "to checkout output"; ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3); + + delete attr_key; } TEST_F(TensorRTEngineTest, add_layer_multi_dim) { @@ -112,7 +123,7 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2); auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, - nvinfer1::DimsCHW{1, 2, 1}); + nvinfer1::Dims3{1, 2, 1}); auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, weight.get(), bias.get()); PADDLE_ENFORCE_NOT_NULL(fc_layer, diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc index 5f8ddcc94235f39d38e648311a9c233d6063df6c..36a25e27d78f5b6406fcc0d908018dd81d010a5f 100644 --- a/paddle/fluid/inference/tensorrt/test_tensorrt.cc +++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc @@ -80,7 +80,7 @@ nvinfer1::IHostMemory* CreateNetwork() { nvinfer1::INetworkDefinition* network = builder->createNetwork(); // Add the input auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT, - nvinfer1::DimsCHW{1, 1, 1}); + nvinfer1::Dims3{1, 1, 1}); EXPECT_NE(input, nullptr); // Add the hidden layer. 
auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get()); diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index f74cd671d6dca0cd52bb595f6ee1370b464d9e30..f0eb0d1fa675b7e88aae44acd79e425a2bc70e47 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -8,44 +8,84 @@ if(WITH_GPU AND TENSORRT_FOUND) set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps}) endif() -function(download_data install_dir data_file) +function(download_data install_dir data_file check_sum) string(REGEX MATCH "[^/\\]+$" file_name ${data_file}) if (NOT EXISTS ${install_dir}/${file_name}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file} ${check_sum}) endif() endfunction() -function(download_int8_data install_dir data_file) +function(download_data_without_verify install_dir data_file) + string(REGEX MATCH "[^/\\]+$" file_name ${data_file}) + if (NOT EXISTS ${install_dir}/${file_name}) + inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL} ${data_file}) + endif() +endfunction() + +function(download_int8_data install_dir data_file check_sum) if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file} ${check_sum}) endif() endfunction() -function(download_bfloat16_data install_dir data_file) +function(download_int8_data_without_verify install_dir data_file) if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/bfloat16 ${data_file}) + inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/int8 ${data_file}) endif() endfunction() -function(download_GRU_data install_dir data_file) +function(download_bfloat16_data install_dir data_file check_sum) if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/gru ${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/bfloat16 ${data_file} ${check_sum}) endif() endfunction() -function(download_quant_data install_dir data_file) +function(download_bfloat16_data_without_verify install_dir data_file) if (NOT EXISTS ${install_dir}/${data_file}) - inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file}) + inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/bfloat16 ${data_file}) endif() endfunction() -function(download_model_and_data install_dir model_name data_name) - download_data(${install_dir} ${model_name}) - download_data(${install_dir} ${data_name}) +function(download_GRU_data install_dir data_file check_sum) + if (NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/gru ${data_file} ${check_sum}) + endif() endfunction() -function(download_result install_dir result_name) - download_data(${install_dir} ${result_name}) +function(download_GRU_data_without_verify install_dir data_file) + if (NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/gru ${data_file}) + endif() +endfunction() + +function(download_quant_data install_dir data_file check_sum) + if (NOT EXISTS 
${install_dir}/${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file} ${check_sum}) + endif() +endfunction() + +function(download_quant_data_without_verify install_dir data_file) + if (NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress_without_verify(${install_dir} ${INFERENCE_URL}/int8/QAT_models ${data_file}) + endif() +endfunction() + +function(download_model_and_data install_dir model_name model_check_sum data_name data_check_sum) + download_data(${install_dir} ${model_name} ${model_check_sum}) + download_data(${install_dir} ${data_name} ${data_check_sum}) +endfunction() + +function(download_model_and_data_without_verify install_dir model_name data_name) + download_data_without_verify(${install_dir} ${model_name}) + download_data_without_verify(${install_dir} ${data_name}) +endfunction() + +function(download_result install_dir result_name check_sum) + download_data(${install_dir} ${result_name} ${check_sum}) +endfunction() + +function(download_result_without_verify install_dir result_name) + download_data_without_verify(${install_dir} ${result_name}) endfunction() function(inference_analysis_api_test target install_dir filename) @@ -165,18 +205,18 @@ endfunction() if(NOT APPLE AND WITH_MKLML) # RNN1 set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") - download_model_and_data(${RNN1_INSTALL_DIR} "rnn1/model.tar.gz" "rnn1/data.txt.tar.gz") + download_model_and_data_without_verify(${RNN1_INSTALL_DIR} "rnn1/model.tar.gz" "rnn1/data.txt.tar.gz") inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) # seq_pool1 set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") - download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") + download_model_and_data_without_verify(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_pool1_compare_determine ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_determine_tester.cc) inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_compare_tester.cc) inference_analysis_api_test(test_analyzer_seq_pool1_fuse_compare_zero_copy ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_compare_zero_copy_tester.cc) inference_analysis_api_test(test_analyzer_seq_pool1_fuse_statis ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_fuse_statis_tester.cc) inference_analysis_api_test(test_analyzer_seq_pool1_profile ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_profile_tester.cc) - if(NOT WIN32) + if(NOT WIN32 AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") set_tests_properties(test_analyzer_seq_pool1_compare_determine PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_seq_pool1 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_seq_pool1_fuse_compare_zero_copy PROPERTIES TIMEOUT 120) @@ -193,7 +233,7 @@ endif() # RNN2 set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") -download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") +download_model_and_data_without_verify(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc) # TODO(luotao, Superjom) Disable DAM test, temporarily fix @@ -201,12 +241,12 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 # After inference framework refactor, will reopen it. 
# normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") -download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") +download_model_and_data_without_verify(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") #inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") -download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") +download_model_and_data_without_verify(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt) @@ -216,52 +256,52 @@ inference_analysis_api_test(test_analyzer_save_model ${DAM_SMALL_INSTALL_DIR} an # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") -download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") +download_model_and_data_without_verify(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR} analyzer_ner_tester.cc) # lac set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") -download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz") +download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" 419ca6eb85f57a01bfe173591910aec5 "lac_data.txt.tar.gz" 9983539cd6b34fbdc411e43422776bfd) inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc) # Pyramid DNN set(PYRAMID_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/pyramid_dnn") -download_model_and_data(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz") +download_model_and_data_without_verify(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} analyzer_pyramid_dnn_tester.cc) #Ernie set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") -download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" "Ernie_data.txt.tar.gz" "Ernie_result.txt.tar.gz") -download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz") +download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" aa59192dd41ed377f9f168e3a1309fa6 "Ernie_data.txt.tar.gz" 5396e63548edad7ca561e7e26a9476d1) +download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz" 73beea65abda2edb61c1662cd3180c62) inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc) #Ernie large set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie_Large") -download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_large_model.tar.gz" "Ernie_large_data.txt.tar.gz" "Ernie_large_result.txt.tar.gz") -download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz") +download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_large_model.tar.gz" af7715245ed32cc77374625d4c80f7ef "Ernie_large_data.txt.tar.gz" edb2113eec93783cad56ed76d47ba57f) +download_result(${ERNIE_INSTALL_DIR} "Ernie_large_result.txt.tar.gz" 1facda98eef1085dc9d435ebf3f23a73) inference_analysis_test(test_analyzer_ernie_large SRCS analyzer_ernie_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${ERNIE_INSTALL_DIR}/model 
--infer_data=${ERNIE_INSTALL_DIR}/data.txt --refer_result=${ERNIE_INSTALL_DIR}/result.txt --ernie_large=true) -if(NOT WIN32 AND NOT APPLE) +if(NOT WIN32 AND NOT APPLE AND TEST test_analyzer_ernie_large) set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 150 LABELS "RUN_TYPE=NIGHTLY") endif() -if (WIN32) +if (WIN32 AND TEST test_analyzer_ernie_large) set_tests_properties(test_analyzer_ernie_large PROPERTIES TIMEOUT 200) endif() # text_classification set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") -download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") +download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" 3f0f440313ca50e26184e65ffd5809ab "text_classification_data.txt.tar.gz" 36ae620020cc3377f45ed330dd36238f) inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc) # seq_conv1 set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") -download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") +download_model_and_data_without_verify(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) # transformer, the dataset only works on batch_size=8 now set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer") -download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz" "temp/transformer_data.txt.tar.gz") +download_model_and_data_without_verify(${TRANSFORMER_INSTALL_DIR} "temp/transformer_model.tar.gz" "temp/transformer_data.txt.tar.gz") inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_compare_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 @@ -278,23 +318,22 @@ inference_analysis_test(test_analyzer_transformer_profile SRCS analyzer_transfor # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}/ocr.tar.gz) - inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/ocr.tar.gz") + inference_download_and_uncompress_without_verify(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/ocr.tar.gz") endif() inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) # densebox set(DENSEBOX_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/densebox") -download_data(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz") -#inference_analysis_test(test_analyzer_detect SRCS analyzer_detect_tester.cc -# EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} -# ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt -# --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt) -#set_property(TEST test_analyzer_detect PROPERTY ENVIRONMENT GLOG_vmodule=analysis_predictor=2) +download_data_without_verify(${DENSEBOX_INSTALL_DIR} "densebox.tar.gz") +inference_analysis_test(test_analyzer_detect_functional_mkldnn SRCS analyzer_detect_functional_mkldnn_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${DENSEBOX_INSTALL_DIR}/model --infer_data=${DENSEBOX_INSTALL_DIR}/detect_input_50.txt + --infer_shape=${DENSEBOX_INSTALL_DIR}/shape_50.txt) # 
mobilenet with transpose op set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") if (NOT EXISTS ${MOBILENET_INSTALL_DIR}/mobilenet.tar.gz) - inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/mobilenet.tar.gz") + inference_download_and_uncompress_without_verify(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos/mobilenet.tar.gz") endif() inference_analysis_api_test(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) @@ -307,13 +346,13 @@ inference_analysis_api_test_with_fake_data_build(${IMG_CLASS_TEST_APP} ${IMG_CLA # googlenet set(GOOGLENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/googlenet") -download_data(${GOOGLENET_MODEL_DIR} "googlenet.tar.gz") +download_data_without_verify(${GOOGLENET_MODEL_DIR} "googlenet.tar.gz") inference_analysis_api_test_with_fake_data_run(test_analyzer_googlenet ${IMG_CLASS_TEST_APP} ${GOOGLENET_MODEL_DIR} false) # resnet50 set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") -download_data(${RESNET50_MODEL_DIR} "resnet50_model.tar.gz") +download_data_without_verify(${RESNET50_MODEL_DIR} "resnet50_model.tar.gz") inference_analysis_api_test_with_fake_data_run(test_analyzer_resnet50 ${IMG_CLASS_TEST_APP} ${RESNET50_MODEL_DIR} true) if (WIN32) @@ -323,7 +362,7 @@ endif() # mobilenet with depthwise_conv op set(MOBILENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv") -download_data(${MOBILENET_MODEL_DIR} "mobilenet_model.tar.gz") +download_data_without_verify(${MOBILENET_MODEL_DIR} "mobilenet_model.tar.gz") inference_analysis_api_test_with_fake_data_run(test_analyzer_mobilenet_depthwise_conv ${IMG_CLASS_TEST_APP} ${MOBILENET_MODEL_DIR} false) @@ -340,7 +379,7 @@ if(WITH_MKLDNN) set(IMAGENET_DATA_ARCHIVE "imagenet_val_100_tail.tar.gz") set(IMAGENET_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/imagenet") set(IMAGENET_DATA_PATH "${IMAGENET_DATA_DIR}/data.bin") - download_int8_data(${IMAGENET_DATA_DIR} ${IMAGENET_DATA_ARCHIVE}) + download_int8_data_without_verify(${IMAGENET_DATA_DIR} ${IMAGENET_DATA_ARCHIVE}) # build test binary to be used in subsequent tests set(INT8_IMG_CLASS_TEST_APP "test_analyzer_int8_image_classification") @@ -349,40 +388,40 @@ if(WITH_MKLDNN) # resnet50 int8 set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50") - download_int8_data(${INT8_RESNET50_MODEL_DIR} "resnet50_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_RESNET50_MODEL_DIR} "resnet50_int8_model.tar.gz" ) inference_analysis_api_int8_test_run(test_analyzer_int8_resnet50 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH}) # mobilenetv1 int8 set(INT8_MOBILENETV1_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv1") - download_int8_data(${INT8_MOBILENETV1_MODEL_DIR} "mobilenetv1_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_MOBILENETV1_MODEL_DIR} "mobilenetv1_int8_model.tar.gz" ) inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv1 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV1_MODEL_DIR} ${IMAGENET_DATA_PATH}) # mobilenetv2 int8 set(INT8_MOBILENETV2_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv2") - download_int8_data(${INT8_MOBILENETV2_MODEL_DIR} "mobilenet_v2_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_MOBILENETV2_MODEL_DIR} "mobilenet_v2_int8_model.tar.gz" ) inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv2 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH}) # resnet101 int8 # TODO(grygielski) 
Enable after MKL-DNN 1.0 merge set(INT8_RESNET101_MODEL_DIR "${INT8_DATA_DIR}/resnet101") - download_int8_data(${INT8_RESNET101_MODEL_DIR} "Res101_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_RESNET101_MODEL_DIR} "Res101_int8_model.tar.gz" ) # inference_analysis_api_int8_test_run(test_analyzer_int8_resnet101 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET101_MODEL_DIR} ${IMAGENET_DATA_PATH}) # vgg16 int8 # TODO(grygielski) Enable after MKL-DNN 1.0 merge set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16") - download_int8_data(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" ) # inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) # vgg19 int8 # TODO(grygielski) Enable after MKL-DNN 1.0 merge set(INT8_VGG19_MODEL_DIR "${INT8_DATA_DIR}/vgg19") - download_int8_data(${INT8_VGG19_MODEL_DIR} "VGG19_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_VGG19_MODEL_DIR} "VGG19_int8_model.tar.gz" ) # inference_analysis_api_int8_test_run(test_analyzer_int8_vgg19 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG19_MODEL_DIR} ${IMAGENET_DATA_PATH}) # googlenet int8 set(INT8_GOOGLENET_MODEL_DIR "${INT8_DATA_DIR}/googlenet") - download_int8_data(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" ) inference_analysis_api_int8_test_run_custom_warmup_batch_size(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH} 10) ### BFLOAT16 tests @@ -410,7 +449,7 @@ if(WITH_MKLDNN) set(INT8_OBJ_DETECT_TEST_APP_SRC "analyzer_int8_object_detection_tester.cc") # download dataset if necessary - download_int8_data(${INT8_DATA_DIR} "pascalvoc_val_head_300.tar.gz") + download_int8_data_without_verify(${INT8_DATA_DIR} "pascalvoc_val_head_300.tar.gz") # build test binary to be used in subsequent tests @@ -418,13 +457,13 @@ if(WITH_MKLDNN) # mobilenet-ssd int8 set(INT8_MOBILENET_SSD_MODEL_DIR "${INT8_DATA_DIR}/mobilenet-ssd") - download_int8_data(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" ) + download_int8_data_without_verify(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" ) inference_analysis_api_object_dection_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH}) ### Lexcial analysis GRU model set(GRU_PATH "${INFERENCE_DEMO_INSTALL_DIR}/gru") - download_GRU_data("${GRU_PATH}" "GRU_eval_data.tar.gz") - download_GRU_data("${GRU_PATH}" "GRU_eval_model_v2.tar.gz") + download_GRU_data_without_verify("${GRU_PATH}" "GRU_eval_data.tar.gz") + download_GRU_data_without_verify("${GRU_PATH}" "GRU_eval_model_v2.tar.gz") set(GRU_DATA_PATH "${GRU_PATH}/GRU_eval_data.bin") set(GRU_MODEL_PATH "${GRU_PATH}/GRU_eval_model_v2") set(LEXICAL_TEST_APP "test_analyzer_lexical_analysis") @@ -451,9 +490,9 @@ if(WITH_MKLDNN) set(QUANT2_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2") set(QUANT2_INT8_MobileNetV1_MODEL_DIR "${QUANT_DATA_DIR}/MobileNetV1_quant2_int8") if(NOT LINUX) - download_quant_data(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz") + download_quant_data_without_verify(${QUANT2_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf.tar.gz") endif(NOT LINUX) - download_quant_data(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz") + 
download_quant_data_without_verify(${QUANT2_INT8_MobileNetV1_MODEL_DIR} "MobileNet_qat_perf_int8.tar.gz") inference_analysis_api_quant_test_run(test_analyzer_quant_performance_benchmark ${QUANT_IMG_CLASS_TEST_APP} ${QUANT2_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf/float ${QUANT2_INT8_MobileNetV1_MODEL_DIR}/MobileNet_qat_perf_int8 ${IMAGENET_DATA_PATH}) ### Other tests @@ -465,13 +504,13 @@ if(WITH_MKLDNN) inference_analysis_test_run(test_mkldnn_quantizer_config COMMAND ${MKLDNN_QUANTIZER_CONFIG_TEST_APP}) # preprocess data2bin imagenet - download_int8_data(${INT8_DATA_DIR} "imagenet_small.tar.gz") + download_int8_data_without_verify(${INT8_DATA_DIR} "imagenet_small.tar.gz") set(IMAGENET_SMALL_DATA_DIR "${INT8_DATA_DIR}/imagenet_small") set(IMAGENET_SMALL_OUTPUT_FILE "imagenet_small.bin") preprocess_data2bin_test_run(preprocess_local_imagenet "full_ILSVRC2012_val_preprocess.py" ${IMAGENET_SMALL_DATA_DIR} ${IMAGENET_SMALL_OUTPUT_FILE}) # preprocess data2bin pascalvoc - download_int8_data(${INT8_DATA_DIR} "pascalvoc_small.tar.gz") + download_int8_data_without_verify(${INT8_DATA_DIR} "pascalvoc_small.tar.gz") set(PASCALVOC_SMALL_DATA_DIR "${INT8_DATA_DIR}/pascalvoc_small") set(PASCALVOC_SMALL_OUTPUT_FILE "pascalvoc_small.bin") preprocess_data2bin_test_run(preprocess_local_pascalvoc "full_pascalvoc_test_preprocess.py" ${PASCALVOC_SMALL_DATA_DIR} ${PASCALVOC_SMALL_OUTPUT_FILE}) @@ -480,26 +519,26 @@ endif() # bert, max_len=20, embedding_dim=128 set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") -download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") +download_model_and_data_without_verify(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) # multiple models prediction set(MMP_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/multi_model_prediction") -download_data(${MMP_INSTALL_DIR} PaddleInference/mobilenet_v2_models.tar.gz) +download_data_without_verify(${MMP_INSTALL_DIR} PaddleInference/mobilenet_v2_models.tar.gz) inference_multiple_models_analysis_api_test(test_analyzer_multi_model_prediction ${MMP_INSTALL_DIR} analyzer_mmp_tester.cc) if(WITH_GPU AND TENSORRT_FOUND) set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models") if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models.tar.gz) - inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz") + inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz" 3dcccdc38b549b6b1b4089723757bd98) endif() set(TEST_SPLIT_CONVERTER_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_split_op_converter_test") if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}/split_converter.tgz) - inference_download_and_uncompress(${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test "split_converter.tgz") + inference_download_and_uncompress_without_verify(${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test "split_converter.tgz") endif() set(TEST_INSTANCE_NORM_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_instance_norm_test") if (NOT EXISTS ${TEST_INSTANCE_NORM_MODEL}/instance_norm.tgz) - inference_download_and_uncompress(${TEST_INSTANCE_NORM_MODEL} ${INFERENCE_URL}/tensorrt_test "instance_norm.tgz") + inference_download_and_uncompress_without_verify(${TEST_INSTANCE_NORM_MODEL} ${INFERENCE_URL}/tensorrt_test "instance_norm.tgz") endif() 
inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} @@ -531,7 +570,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz) - inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") + inference_download_and_uncompress_without_verify(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") endif() inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} @@ -539,7 +578,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware") if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware.tgz) - inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz") + inference_download_and_uncompress_without_verify(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz") endif() inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} @@ -547,12 +586,12 @@ if(WITH_GPU AND TENSORRT_FOUND) set(TEST_TRT_DYNAMIC_MODEL2 "${TRT_MODEL_INSTALL_DIR}/complex_model_dynamic") if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL2}/complex_model_dynamic2.tar.gz) - inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL2} ${INFERENCE_URL}/tensorrt_test "complex_model_dynamic2.tar.gz") + inference_download_and_uncompress_without_verify(${TEST_TRT_DYNAMIC_MODEL2} ${INFERENCE_URL}/tensorrt_test "complex_model_dynamic2.tar.gz") endif() set(TEST_TRT_DYNAMIC_MODEL "${TRT_MODEL_INSTALL_DIR}/conv_bn_swish_split_gelu") if (NOT EXISTS ${TEST_TRT_DYNAMIC_MODEL}/conv_bn_swish_split_gelu.tar.gz) - inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL} ${INFERENCE_URL}/tensorrt_test "conv_bn_swish_split_gelu.tar.gz") + inference_download_and_uncompress(${TEST_TRT_DYNAMIC_MODEL} ${INFERENCE_URL}/tensorrt_test "conv_bn_swish_split_gelu.tar.gz" 2a5e8791e47b221b4f782151d76da9c6) endif() inference_analysis_test(trt_dynamic_shape_test SRCS trt_dynamic_shape_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} @@ -560,7 +599,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set(TEST_TRT_ERNIE_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test") if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4.tar.gz) - inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4.tar.gz") + inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4.tar.gz" 5fa371efa75706becbaad79195d2ca68) endif() inference_analysis_test(test_trt_dynamic_shape_ernie SRCS trt_dynamic_shape_ernie_test.cc @@ -569,7 +608,7 @@ if(WITH_GPU AND TENSORRT_FOUND) set(TEST_TRT_TRANSFORMER_PRUNE_MODEL "${TRT_MODEL_INSTALL_DIR}/transformer_prune") if (NOT EXISTS ${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune.tar.gz) - inference_download_and_uncompress(${TEST_TRT_TRANSFORMER_PRUNE_MODEL} ${INFERENCE_URL}/tensorrt_test "transformer_prune.tar.gz") + inference_download_and_uncompress(${TEST_TRT_TRANSFORMER_PRUNE_MODEL} ${INFERENCE_URL}/tensorrt_test "transformer_prune.tar.gz" 77b56dc73ff0cf44ddb1ce9ca0b0f471) endif() inference_analysis_test(test_trt_dynamic_shape_transformer_prune SRCS trt_dynamic_shape_transformer_prune_test.cc @@ -577,7 +616,7 @@ if(WITH_GPU AND 
TENSORRT_FOUND) ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized.tgz) - inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") + inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz" 833d73fc6a7f7e1ee4a1fd6419209e55) endif() inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_serialize_deserialize_test.cc @@ -585,7 +624,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized.tgz) - inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_fp16_unserialized.tgz") + inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_fp16_unserialized.tgz" c5ff2d0cad79953ffbf2b8b9e2fae6e4) endif() inference_analysis_test(test_trt_dynamic_shape_ernie_fp16_ser_deser SRCS trt_dynamic_shape_ernie_fp16_serialize_deserialize_test.cc @@ -595,7 +634,7 @@ if(WITH_GPU AND TENSORRT_FOUND) endif() set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite") -download_data(${LITE_MODEL_INSTALL_DIR} "mul_model_fp32.tgz") +download_data_without_verify(${LITE_MODEL_INSTALL_DIR} "mul_model_fp32.tgz") inference_analysis_test(lite_mul_model_test SRCS lite_mul_model_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} @@ -645,6 +684,10 @@ if(WITH_GPU) ARGS --infer_model=${RESNET50_MODEL_DIR}) endif() +if("$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON") + return() +endif() + if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(trt_resnext_test PROPERTIES TIMEOUT 300) set_tests_properties(trt_quant_int8_yolov3_r50_test PROPERTIES TIMEOUT 300) diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..384bef8a4b439d8543127d5e7a1110525f06d282 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_detect_functional_mkldnn_tester.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +DEFINE_string(infer_shape, "", "data shape file"); +DEFINE_int32(sample, 20, "number of sample"); + +namespace paddle { +namespace inference { +namespace analysis { + +struct Record { + std::vector data; + std::vector shape; +}; + +Record ProcessALine(const std::string &line, const std::string &shape_line) { + VLOG(3) << "process a line"; + + Record record; + std::vector data_strs; + split(line, ' ', &data_strs); + for (auto &d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(shape_line, ' ', &shape_strs); + for (auto &s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + return record; +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + // cfg->SwitchIrDebug(); // Enable to have graphs dumped + cfg->SwitchSpecifyInputNames(false); + cfg->SetCpuMathLibraryNumThreads(FLAGS_cpu_num_threads); +} + +void SetInput(std::vector> *inputs, + const std::string &line, const std::string &shape_line) { + auto record = ProcessALine(line, shape_line); + + PaddleTensor input; + input.shape = record.shape; + input.dtype = PaddleDType::FLOAT32; + size_t input_size = record.data.size() * sizeof(float); + input.data.Resize(input_size); + memcpy(input.data.data(), record.data.data(), input_size); + std::vector input_slots; + input_slots.assign({input}); + (*inputs).emplace_back(input_slots); +} + +#ifdef PADDLE_WITH_MKLDNN +int GetNumCachedObjects(void) { + auto &pool = platform::DeviceContextPool::Instance(); + platform::CPUPlace place; + auto onednn_dev_ctx = + dynamic_cast(pool.Get(place)); + return onednn_dev_ctx->GetCachedObjectsNumber(); +} + +void validate_cache_onednn(int cache_capacity = 1) { + AnalysisConfig cfg; + SetConfig(&cfg); + cfg.EnableMKLDNN(); + cfg.SetMkldnnCacheCapacity(cache_capacity); + + auto predictor = CreatePaddlePredictor(cfg); + std::vector> ref_outputs; + std::vector> input_slots_all; + + std::ifstream file(FLAGS_infer_data); + std::ifstream infer_file(FLAGS_infer_shape); + std::vector lines; + std::vector shape_lines; + + // Let's work with 4 samples + auto num_samples = 4; + ref_outputs.resize(num_samples); + lines.resize(num_samples); + shape_lines.resize(num_samples); + + // Let's remember number of cached objects before + // execution and after every single execution + std::vector cache_filling; + cache_filling.push_back(GetNumCachedObjects()); + + // compute sequentially prediction + for (int i = 0; i < num_samples; ++i) { + std::getline(file, lines[i]); + std::getline(infer_file, shape_lines[i]); + SetInput(&input_slots_all, lines[i], shape_lines[i]); + predictor->Run(input_slots_all[i], &ref_outputs[i], FLAGS_batch_size); + // record number of cached objects + cache_filling.push_back(GetNumCachedObjects()); + } + + file.close(); + infer_file.close(); + + // Pick first output tensor from model + // as internally reorders may be called + // so it will impact cache size + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputTensor(output_names[0]); + std::vector output_shape = output_t->shape(); + size_t out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + std::vector out_data; + out_data.resize(out_num); + output_t->CopyToCpu(out_data.data()); + + // Release 
predictor (relevant cache should be emptied) + predictor.reset(nullptr); + cache_filling.push_back(GetNumCachedObjects()); + + // Compare results + // First and last value should be equal e.g. before using cache (empty) and + // after releasing executor + PADDLE_ENFORCE_EQ( + cache_filling[0], cache_filling[cache_filling.size() - 1], + platform::errors::Fatal("Cache size before execution and after " + "releasing Executor do not match")); + + // Iterate to check if cache is not increasing + // over exceeding cache capacity + if (cache_capacity != 0) { + for (int i = cache_capacity + 1; i < num_samples + 1; ++i) { + PADDLE_ENFORCE_EQ( + cache_filling[cache_capacity], cache_filling[i], + platform::errors::Fatal("Cache capacity should not increase " + "after full capacity is used")); + } + } +} + +TEST(Analyzer_detect, validate_cache_onednn) { + validate_cache_onednn(2 /*cache_capacity */); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc index 024313837e0b63a4ff2325b9cedd75a608c2a879..720c90090cf746121ee79b44bd3c9ab35b736dba 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc @@ -38,7 +38,6 @@ void SetAnalysisConfig(AnalysisConfig *cfg, cfg->SwitchSpecifyInputNames(false); cfg->SetCpuMathLibraryNumThreads(num_threads); cfg->EnableMKLDNN(); - cfg->pass_builder()->AppendPass("mkldnn_placement_pass"); } std::vector ReadSentenceLod(std::ifstream &file, size_t offset, diff --git a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py index e911c94208711e3cd6929a68024c8957a5aae334..adb6aa4d75344d767ce44019f3c1162956087210 100644 --- a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py +++ b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py @@ -167,7 +167,7 @@ def run_convert(): os.path.getsize(output_file) == FULL_SIZE_BYTES): if os.path.exists(output_file): sys.stderr.write( - "\n\nThe existing binary file is broken. Start to generate new one...\n\n". + "\n\nThe existing binary file[{}] is broken. Start to generate new one...\n\n". 
format(output_file)) os.remove(output_file) if retry < try_limit: diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 170b915ec7436727333f6de5bae68fe1d1f6300b..dbc2acbed8367a949857bb56fb83fd592bffaa3f 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -33,6 +33,7 @@ #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/test_helper.h" diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc index 6d69565716ee7a36ac090347859a3729e509836c..e449fb5096e6e068ef49866407010ad9b4658892 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc @@ -16,61 +16,66 @@ limitations under the License. */ #include #include "gflags/gflags.h" +#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { namespace inference { -void run(const AnalysisConfig& config, std::vector* out_data) { +void run(const AnalysisConfig& config, std::vector* out_data, int bs) { auto predictor = CreatePaddlePredictor(config); auto input_names = predictor->GetInputNames(); - int run_batch = 1; + int run_batch = bs; const int run_seq_len = 128; + size_t len = run_batch * run_seq_len; - std::vector tmp_input; - std::vector tmp_four_input; - tmp_input.reserve(run_batch * run_seq_len); - tmp_four_input.reserve(run_batch * run_seq_len); - - int64_t i0[run_seq_len] = { + int64_t i0_bs1[run_seq_len] = { 1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, 4095, 1902, 4, 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, 75, 201, 340, 9, 14, 44, 486, 218, 1140, 279, 12043, 2}; - int64_t i1[run_seq_len] = { + int64_t i1_bs1[run_seq_len] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - int64_t i2[run_seq_len] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, - 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, - 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, - 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}; - float i3[run_seq_len] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + int64_t i2_bs1[run_seq_len] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}; + float i3_bs1[run_seq_len] = { + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector i0_data(len), i1_data(len), i2_data(len); + std::vector 
i3_data(len); + for (size_t i = 0; i < len; i++) { + i0_data[i] = i0_bs1[i % run_seq_len]; + i1_data[i] = i1_bs1[i % run_seq_len]; + i2_data[i] = i2_bs1[i % run_seq_len]; + i3_data[i] = i3_bs1[i % run_seq_len]; + } // first input auto input_t = predictor->GetInputTensor(input_names[0]); input_t->Reshape({run_batch, run_seq_len, 1}); - input_t->copy_from_cpu(i0); + input_t->copy_from_cpu(i0_data.data()); // second input auto input_t2 = predictor->GetInputTensor(input_names[1]); input_t2->Reshape({run_batch, run_seq_len, 1}); - input_t2->copy_from_cpu(i1); + input_t2->copy_from_cpu(i1_data.data()); // third input. auto input_t3 = predictor->GetInputTensor(input_names[2]); input_t3->Reshape({run_batch, run_seq_len, 1}); - input_t3->copy_from_cpu(i2); + input_t3->copy_from_cpu(i2_data.data()); auto input_t4 = predictor->GetInputTensor(input_names[3]); input_t4->Reshape({run_batch, run_seq_len, 1}); - input_t4->copy_from_cpu(i3); + input_t4->copy_from_cpu(i3_data.data()); ASSERT_TRUE(predictor->ZeroCopyRun()); @@ -83,8 +88,8 @@ void run(const AnalysisConfig& config, std::vector* out_data) { output_t->copy_to_cpu(out_data->data()); } -void trt_ernie(bool with_fp16, std::vector result, - float near_tolerance) { +void trt_ernie(bool with_fp16, std::vector result, float near_tolerance, + int batch_size = 1) { AnalysisConfig config; std::string model_dir = FLAGS_infer_model; SetConfig(&config, model_dir, true); @@ -124,7 +129,7 @@ void trt_ernie(bool with_fp16, std::vector result, config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, opt_input_shape); std::vector out_data; - run(config, &out_data); + run(config, &out_data, batch_size); for (size_t i = 0; i < out_data.size(); i++) { EXPECT_NEAR(result[i], out_data[i], near_tolerance); @@ -143,5 +148,149 @@ TEST(AnalysisPredictor, fp16) { #endif } +TEST(AnalysisPredictor, no_fp16_bs2) { + std::vector result = {0.597841, 0.219972, 0.182187, + 0.597841, 0.219972, 0.182187}; + trt_ernie(false, result, 1e-5, 2); +} + +TEST(AnalysisPredictor, fp16_bs2) { +#ifdef TRT_PLUGIN_FP16_AVALIABLE + std::vector result = {0.598, 0.219, 0.182, 0.598, 0.219, 0.182}; + trt_ernie(true, result, 4e-3, 2); +#endif +} + +// ernie_varlen +std::shared_ptr InitPredictor() { + paddle_infer::Config config; + config.SetModel(FLAGS_infer_model); + + config.EnableUseGpu(100, 0); + + // Open the memory optim. 
+ config.EnableMemoryOptim(); + + int max_batch = 32; + int max_single_seq_len = 128; + int opt_single_seq_len = 64; + int min_batch_seq_len = 1; + int max_batch_seq_len = 512; + int opt_batch_seq_len = 256; + + std::string input_name0 = "read_file_0.tmp_0"; + std::string input_name1 = "read_file_0.tmp_1"; + std::string input_name2 = "read_file_0.tmp_2"; + std::string input_name3 = "read_file_0.tmp_4"; + + std::vector min_shape = {min_batch_seq_len}; + std::vector max_shape = {max_batch_seq_len}; + std::vector opt_shape = {opt_batch_seq_len}; + // Set the input's min, max, opt shape + std::map> min_input_shape = { + {input_name0, min_shape}, + {input_name1, min_shape}, + {input_name2, {1}}, + {input_name3, {1, 1, 1}}}; + std::map> max_input_shape = { + {input_name0, max_shape}, + {input_name1, max_shape}, + {input_name2, {max_batch + 1}}, + {input_name3, {1, max_single_seq_len, 1}}}; + std::map> opt_input_shape = { + {input_name0, opt_shape}, + {input_name1, opt_shape}, + {input_name2, {max_batch + 1}}, + {input_name3, {1, opt_single_seq_len, 1}}}; + + // only kHalf supported + config.EnableTensorRtEngine( + 1 << 30, 1, 5, paddle_infer::Config::Precision::kHalf, false, false); + // erinie varlen must be used with dynamic shape + config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, + opt_input_shape); + // erinie varlen must be used with oss + config.EnableTensorRtOSS(); + + return paddle_infer::CreatePredictor(config); +} + +void run(paddle_infer::Predictor* predictor, std::vector* out_data) { + const int run_batch = 2; + const int run_seq_len = 71; + const int max_seq_len = 128; + + int32_t i1[run_seq_len] = { + // sentence 1 + 1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, 4095, 1902, 4, + 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, 75, 201, 340, 9, 14, 44, + 486, 218, 1140, 279, 12043, 2, + // sentence 2 + 101, 2054, 2234, 2046, 2486, 2044, 1996, 2047, 4552, 2001, 9536, 1029, + 102, 2004, 1997, 2008, 2154, 1010, 1996, 2047, 4552, 9536, 2075, 1996, + 2117, 3072, 2234, 2046, 2486, 1012, 102, + }; + int32_t i2[run_seq_len] = { + // sentence 1 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // sentence 2 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1}; + // shape info of this batch + int32_t i3[3] = {0, 40, 71}; + // max_seq_len represents the max sentence length of all the sentences, only + // length of + // input i4 is useful, data means nothing. 
+ int32_t i4[max_seq_len] = {0}; + + auto input_names = predictor->GetInputNames(); + // first input + auto input_t1 = predictor->GetInputHandle(input_names[0]); + input_t1->Reshape({run_seq_len}); + input_t1->CopyFromCpu(i1); + + // second input + auto input_t2 = predictor->GetInputHandle(input_names[1]); + input_t2->Reshape({run_seq_len}); + input_t2->CopyFromCpu(i2); + + // third input + auto input_t3 = predictor->GetInputHandle(input_names[2]); + input_t3->Reshape({run_batch + 1}); + input_t3->CopyFromCpu(i3); + + // fourth input + auto input_t4 = predictor->GetInputHandle(input_names[3]); + input_t4->Reshape({1, max_seq_len, 1}); + input_t4->CopyFromCpu(i4); + + CHECK(predictor->Run()); + + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data->resize(out_num); + output_t->CopyToCpu(out_data->data()); + + return; +} + +TEST(AnalysisPredictor, ernie_varlen) { +#if IS_TRT_VERSION_GE(7234) + auto predictor = InitPredictor(); + std::vector out_data; + run(predictor.get(), &out_data); + std::vector ref_data{0.59814, 0.219882, 0.181978, + 0.359796, 0.577414, 0.0627908}; + float near_tolerance = 1e-3; + for (size_t i = 0; i < out_data.size(); i++) { + EXPECT_NEAR(ref_data[i], out_data[i], near_tolerance); + } +#endif +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 41b78d39a2594cbe39bc0d0defef7a24047674dc..05c468b798886ac135ed30bff75ce9400f1ca3a1 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -23,7 +23,30 @@ function(inference_download INSTALL_DIR URL FILENAME) ) endfunction() -function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) +function(inference_download_and_uncompress INSTALL_DIR URL FILENAME CHECK_SUM) + message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") + string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME}) + string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME}) + set(EXTERNAL_PROJECT_NAME "extern_download_${FILENAME_EX}") + set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}") + ExternalProject_Add( + ${EXTERNAL_PROJECT_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${INSTALL_DIR} + URL ${URL}/${FILENAME} + URL_HASH MD5=${CHECK_SUM} + DOWNLOAD_DIR ${INSTALL_DIR} + DOWNLOAD_NO_EXTRACT 1 + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND ${CMAKE_COMMAND} -E chdir ${INSTALL_DIR} + ${CMAKE_COMMAND} -E tar xzf ${DOWNLOAD_NAME} + UPDATE_COMMAND "" + INSTALL_COMMAND "" + ) +endfunction() + +function(inference_download_and_uncompress_without_verify INSTALL_DIR URL FILENAME) message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") string(REGEX REPLACE "[-%./\\]" "_" FILENAME_EX ${FILENAME}) string(REGEX MATCH "[^/\\]+$" DOWNLOAD_NAME ${FILENAME}) @@ -47,13 +70,13 @@ endfunction() set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}/word2vec.inference.model.tar.gz) - inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") + inference_download_and_uncompress_without_verify(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") 
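Note: the variable-length ERNIE test above flattens both sentences into one 1-D token stream and passes their boundaries separately; i3 = {0, 40, 71} holds the cumulative offsets of two sentences with 40 and 31 tokens. A minimal sketch of how such offsets relate to per-sentence lengths is below; BuildCumulativeOffsets is an illustrative helper, not part of the Paddle API.

#include <cstdint>
#include <iostream>
#include <vector>

// Build the batch-offset tensor fed to the varlen path: offsets[0] = 0 and
// offsets[i] = offsets[i-1] + len(sentence i-1), so its size is batch + 1.
std::vector<int32_t> BuildCumulativeOffsets(const std::vector<int32_t>& seq_lens) {
  std::vector<int32_t> offsets(seq_lens.size() + 1, 0);
  for (size_t i = 0; i < seq_lens.size(); ++i) {
    offsets[i + 1] = offsets[i] + seq_lens[i];
  }
  return offsets;
}

int main() {
  // Matches the test data: sentence 1 has 40 tokens, sentence 2 has 31.
  const auto offsets = BuildCumulativeOffsets({40, 31});
  for (int32_t v : offsets) std::cout << v << " ";  // prints: 0 40 71
  std::cout << "\n";
  return 0;
}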
set(IMG_CLS_RESNET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/image_classification_resnet") if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model.tgz) - inference_download_and_uncompress(${IMG_CLS_RESNET_INSTALL_DIR} ${INFERENCE_URL} "image_classification_resnet.inference.model.tgz") + inference_download_and_uncompress_without_verify(${IMG_CLS_RESNET_INSTALL_DIR} ${INFERENCE_URL} "image_classification_resnet.inference.model.tgz") endif() set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model") diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 2ea047fa13c10596995916234ef67e8a276b6b22..9a0637453f03f08a50bb1af958b1ba5e584869b4 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -29,6 +29,7 @@ endif() if (WITH_ASCEND_CL) cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info) + cc_library(npu_pinned_allocator SRCS npu_pinned_allocator.cc DEPS allocator npu_info) endif() cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) @@ -73,10 +74,15 @@ endif() list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator) +if (WITH_ASCEND_CL) + list(APPEND AllocatorFacadeDeps npu_pinned_allocator) +endif() + + cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy ) cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) if (WITH_TESTING) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 730efa5c646885026eee1e472205ce723b0fcb1b..3a156f1fa3c4cfb39d8dd3524353fd0c6a616184 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -20,6 +20,9 @@ #include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif #include "paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -72,6 +75,7 @@ class AllocatorFacadePrivate { for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); } + InitNaiveBestFitNPUPinnedAllocator(); #endif break; } @@ -195,6 +199,12 @@ class AllocatorFacadePrivate { void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { allocators_[p] = std::make_shared(p); } + + void InitNaiveBestFitNPUPinnedAllocator() { + allocators_[platform::NPUPinnedPlace()] = + std::make_shared(); + } + #endif class ZeroSizeAllocator : public Allocator { @@ -294,6 +304,11 @@ uint64_t AllocatorFacade::Release(const platform::Place& place) { ->Release(place); } +const std::shared_ptr& AllocatorFacade::GetAllocator( + const 
platform::Place& place) { + return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index fa906fbf5ce8fedb7790e19a1e7c257bbce5faac..7f6ad561aa931bd42fe312fe397cc561a64f723f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -15,11 +15,17 @@ #pragma once #include #include "paddle/fluid/memory/allocation/allocator.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif #include "paddle/fluid/platform/place.h" namespace paddle { namespace memory { namespace allocation { +#ifdef PADDLE_WITH_ASCEND_CL +using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; +#endif // Allocator Facade is the interface exposed to other modules. // All the configuration or dirty code under development should @@ -46,6 +52,7 @@ class AllocatorFacade { // Release unused memory pool. uint64_t Release(const platform::Place& place); + const std::shared_ptr& GetAllocator(const platform::Place& place); // TODO(yy): Allocate a Copy-On-Write allocation? private: diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 3e88d61783c9e67053ef065f61fef5cf991a9b25..bc72b4b20d061445932d877417f02917dfd613cf 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -287,6 +287,21 @@ class NPUBuddyAllocatorList { BuddyAllocator *GetNPUBuddyAllocator(int npu_id) { return NPUBuddyAllocatorList::Instance()->Get(npu_id); } + +BuddyAllocator *GetNPUPinnedBuddyAllocator() { + static std::once_flag init_flag; + static BuddyAllocator *ba = nullptr; + + std::call_once(init_flag, []() { + ba = new BuddyAllocator(std::unique_ptr( + new detail::NPUPinnedAllocator), + platform::NPUPinnedMinChunkSize(), + platform::NPUPinnedMaxChunkSize()); + }); + + return ba; +} + #endif template <> @@ -351,6 +366,59 @@ uint64_t Release(const platform::NPUPlace &place) { #endif } +template <> +size_t Used(const platform::NPUPinnedPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUPinnedBuddyAllocator()->Used(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPinnedPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void *Alloc(const platform::NPUPinnedPlace &place, + size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + auto *buddy_allocator = GetNPUPinnedBuddyAllocator(); + void *ptr = buddy_allocator->Alloc(size); + + if (ptr == nullptr) { + LOG(WARNING) << "aclrtMallocHost Cannot allocate " << size + << " bytes in NPUPinnedPlace"; + } + if (FLAGS_init_allocated_mem) { + memset(ptr, 0xEF, size); + } + return ptr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPinnedPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void Free(const platform::NPUPinnedPlace &place, + void *p, size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + GetNPUPinnedBuddyAllocator()->Free(p); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPinnedPlace' is not supported in CPU only device.")); +#endif +} + +template <> +uint64_t Release( + const platform::NPUPinnedPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUPinnedBuddyAllocator()->Release(); +#else + 
PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPinnedPlace' is not supported in CPU only device.")); +#endif +} + // For CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUBuddyAllocatorList { diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..507a8589d94ddd1adf925aa5e01c787439624c62 --- /dev/null +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +void NPUPinnedAllocator::ProcessEventsAndFree() { + for (auto it = npu_events_.begin(); it != npu_events_.end();) { + aclrtEvent event = it->second; + aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status)); + + if (status == ACL_EVENT_STATUS_COMPLETE) { + Allocation *allocation = it->first; + void *ptr = allocation->ptr(); + free(ptr); + npu_events_.erase(it++); + delete allocation; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event)); + } else { + ++it; + } + } +} + +Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { + ProcessEventsAndFree(); + void *ptr; + int error = posix_memalign(&ptr, kAlignment, size); + PADDLE_ENFORCE_EQ( + error, 0, + platform::errors::ResourceExhausted( + "Fail to alloc memory of %ld size, error code is %d.", size, error)); + return new Allocation(ptr, size, platform::NPUPinnedPlace()); +} + +void NPUPinnedAllocator::FreeImpl(Allocation *allocation) { + void *ptr = allocation->ptr(); + auto iter = npu_events_.find(allocation); + aclrtEvent event = iter->second; + aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status)); + if (status == ACL_EVENT_STATUS_COMPLETE) { + free(ptr); + npu_events_.erase(allocation); + delete allocation; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event)); + } + return; +} + +uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) { + return static_cast(0); +} + +void NPUPinnedAllocator::RecordEvent(Allocation *allocation, + aclrtStream stream) { + aclrtEvent event = nullptr; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event)); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event, stream)); + npu_events_.insert({allocation, event}); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle +#endif diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.h b/paddle/fluid/memory/allocation/npu_pinned_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..4c856b931ee2cf5b5734d90636b4bfd3dad138da --- /dev/null +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.h @@ -0,0 +1,51 @@ +// Copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_ASCEND_CL +#include // NOLINT +#include +#include + +#include "acl/acl.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/npu_info.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class NPUPinnedAllocator : public Allocator { + public: + bool IsAllocThreadSafe() const override { return true; } + void ProcessEventsAndFree(); + void RecordEvent(Allocation *allocation, aclrtStream stream); + constexpr static size_t kAlignment = 4096UL; + + protected: + Allocation *AllocateImpl(size_t size) override; + void FreeImpl(Allocation *allocation) override; + uint64_t ReleaseImpl(const platform::Place &place) override; + + private: + std::unordered_map npu_events_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 0d7065d8bfba0e4ba6f443a3f9e87ee0e1a825a6..9f39c3a823f862caab36f4312c2011e3ada38703 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -192,7 +192,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void* p; // PINNED memory is visible to all CUDA contexts. #ifdef PADDLE_WITH_HIP - hipError_t result = hipHostMalloc(&p, size); + hipError_t result = hipHostMalloc(&p, size, hipHostMallocPortable); #else cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); #endif @@ -310,6 +310,60 @@ void NPUAllocator::Free(void* p, size_t size, size_t index) { } bool NPUAllocator::UseGpu() const { return true; } + +void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + size_t usable = + paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_; + + if (size > usable) { + LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB pinned memory." + << ", available " << usable / 1024.0 / 1024.0 << " MB"; + return nullptr; + } + + void* p; + // PINNED memory is visible to all NPU contexts. 
+ auto result = aclrtMallocHost(&p, size); + + if (result == ACL_ERROR_NONE) { + *index = 1; // PINNED memory + npu_pinnd_alloc_size_ += size; + return p; + } else { + LOG(WARNING) << "aclrtMallocHost failed."; + return nullptr; + } + + return nullptr; +} + +void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) { + aclError err; + PADDLE_ENFORCE_EQ(index, 1, platform::errors::InvalidArgument( + "The index should be 1, but got %d", index)); + + PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size, + platform::errors::InvalidArgument( + "The size of memory (%d) to free exceeds the size of " + "allocated npu pinned memory (%d)", + size, npu_pinnd_alloc_size_)); + npu_pinnd_alloc_size_ -= size; + err = aclrtFreeHost(p); + + if (err != ACL_ERROR_NONE) { + PADDLE_ENFORCE_EQ( + err, 0, + platform::errors::Fatal( + "aclrtFreeHost failed in NPUPinnedAllocator, error code is %d", + err)); + } +} + +bool NPUPinnedAllocator::UseGpu() const { return false; } + #endif } // namespace detail diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index 26711ae4070f5ed72f77519b196c4c354cb049e1..92042f0bbae9f0d29d15b9ed266f57cfa7594412 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -80,6 +80,16 @@ class NPUAllocator : public SystemAllocator { size_t npu_alloc_size_ = 0; int npu_id_; }; + +class NPUPinnedAllocator : public SystemAllocator { + public: + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t npu_pinnd_alloc_size_ = 0; +}; #endif } // namespace detail diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 730d49e8acd93022e6e46f7285b9548ed7a5c6d8..f2f8c5d1fb5551b4d41cb8d283a2f6b65e493269 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -30,6 +30,7 @@ void Copy(platform::CPUPlace, void* dst, platform::CPUPlace, const void* src, size_t num) { if (UNLIKELY(num == 0)) return; + VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); } @@ -245,7 +246,7 @@ void Copy(platform::CPUPlace dst_place, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); static_cast(pool.Get(src_place))->Wait(); - platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU"); + platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); } } @@ -294,6 +295,86 @@ void Copy(platform::NPUPlace dst_place, } } } + +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, + const void* src, size_t num) { + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place; + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::NPUPinnedPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num) { + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place; + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::NPUPinnedPlace dst_place, void* dst, + platform::NPUPinnedPlace src_place, const void* src, size_t num) { + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place; + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} + +template <> +void 
Copy( + platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place, + const void* src, size_t num, aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetNPUDeviceId(src_place.device); + + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by thream(" << stream << ")"; + + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + + platform::RecordEvent record_event("NpuMemcpySync:NPU->NPUPinned"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); + } +} + +template <> +void Copy( + platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place, + const void* src, size_t num, aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetNPUDeviceId(dst_place.device); + + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by thream(" << stream << ")"; + + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + } else { + // On NPU, async operation after sync operation is ok, while sync operation + // after async is not ok, since the async operation may not done. + // So, its needed to do wait before sync operation. + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + + platform::RecordEvent record_event("NpuMemcpySync:NPUPinned->NPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); + } +} + #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 6e11c64afc4bd813362640e151203d4dd700fea5..0956410041bb23558fec5ad3c628590649e01624 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -7,8 +7,6 @@ set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h.tmp CACHE INTE set(pybind_file_final ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h) file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operators/CMakeLists.txt. 
DO NOT EDIT!\n\n") -copy_if_different(${pybind_file} ${pybind_file_final}) - add_subdirectory(math) add_subdirectory(eigen) add_subdirectory(controlflow) @@ -20,6 +18,9 @@ add_subdirectory(optimizers) add_subdirectory(reduce_ops) add_subdirectory(sequence_ops) add_subdirectory(jit) +if(WITH_MKLDNN) + add_subdirectory(mkldnn) +endif() if(WITH_DISTRIBUTE) @@ -115,9 +116,9 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_fun set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_cc_function) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_function) if (WITH_GPU OR WITH_ROCM) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor eigen_cu_function) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor) endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer) @@ -171,7 +172,7 @@ endif() if (WITH_ASCEND_CL) cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) - cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_cc_function scope device_context enforce executor compare_op) + cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_function scope device_context enforce executor compare_op) endif() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") @@ -203,3 +204,5 @@ endif() if (WITH_GPU OR WITH_ASCEND_CL) cc_test(copy_cross_scope_test SRCS copy_cross_scope_test.cc DEPS op_registry copy_cross_scope_op scope device_context enforce executor) endif() + +copy_if_different(${pybind_file} ${pybind_file_final}) diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index 5c431ce77dc76ae08c70cd54989f323a230d47f7..796425a132b0003ae055569c23b107bd80987f9f 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -164,9 +164,9 @@ REGISTER_OP_CPU_KERNEL( ops::AbsKernel, ops::AbsKernel, ops::AbsKernel, + paddle::platform::complex>, ops::AbsKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( abs_grad, ops::AbsGradKernel, @@ -174,9 +174,9 @@ REGISTER_OP_CPU_KERNEL( ops::AbsGradKernel, ops::AbsGradKernel, ops::AbsGradKernel, + paddle::platform::complex>, ops::AbsGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( abs_grad_grad, @@ -187,6 +187,6 @@ REGISTER_OP_CPU_KERNEL( ops::AbsDoubleGradKernel, ops::AbsDoubleGradKernel, + paddle::platform::complex>, ops::AbsDoubleGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/abs_op.cu b/paddle/fluid/operators/abs_op.cu index e373d628f6cbd6b5ee48edc984a68d2767ce0593..b0eba229fde51841542b5d8d1d73330b40bd29f0 100644 --- a/paddle/fluid/operators/abs_op.cu +++ b/paddle/fluid/operators/abs_op.cu @@ -13,44 +13,78 @@ // limitations under the License. 
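Note: the abs_op.cc hunk above moves the CPU kernels from the old complex64/complex128 typedefs to Paddle's templated platform::complex type, and the CUDA abs functor that follows returns the real-valued magnitude rather than T itself. A standalone illustration of that type relationship, using std::complex purely as a stand-in for Paddle's complex type:

#include <complex>
#include <iostream>
#include <type_traits>

int main() {
  const std::complex<float> z(3.0f, 4.0f);
  // The absolute value of a complex number is its magnitude, a real scalar,
  // which is why the abs kernel's output type is the real type of T.
  static_assert(std::is_same<decltype(std::abs(z)), float>::value,
                "abs(complex<float>) yields a plain float");
  std::cout << std::abs(z) << "\n";  // prints 5
  return 0;
}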
#include "paddle/fluid/operators/abs_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/float16.h" +namespace paddle { +namespace operators { + +template +struct CudaAbsFunctor; + +template +struct CudaAbsFunctor>> { + __device__ __forceinline__ math::Real operator()(const T* args) const { + return abs(args[0]); + } +}; + +template +struct CudaAbsFunctor>> { + __device__ __forceinline__ T operator()(const T* args) const { + return std::abs(args[0]); + } +}; + +template +class AbsKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + out->mutable_data>(context.GetPlace()); + + auto& dev_ctx = + context.template device_context(); + std::vector ins = {x}; + std::vector outs = {out}; + auto functor = CudaAbsFunctor(); + LaunchSameDimsElementwiseCudaKernel>(dev_ctx, ins, &outs, + functor); + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; +namespace plat = paddle::platform; + REGISTER_OP_CUDA_KERNEL( - abs, ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel, - ops::AbsKernel); + abs, ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel, + ops::AbsKernel>, + ops::AbsKernel>); REGISTER_OP_CUDA_KERNEL( - abs_grad, ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel, - ops::AbsGradKernel); + abs_grad, ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel, + ops::AbsGradKernel>, + ops::AbsGradKernel>); REGISTER_OP_CUDA_KERNEL( - abs_grad_grad, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel, - ops::AbsDoubleGradKernel); + abs_grad_grad, ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel, + ops::AbsDoubleGradKernel>, + ops::AbsDoubleGradKernel>); diff --git a/paddle/fluid/operators/abs_op_npu.cc b/paddle/fluid/operators/abs_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..7bfe35ef6e02145714209452fadd9182b58659e7 --- /dev/null +++ b/paddle/fluid/operators/abs_op_npu.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. 
*/ + +#include "paddle/fluid/operators/abs_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AbsNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner("Abs", + { + *x, + }, + {*out}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class AbsGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + dx->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner("AbsGrad", {*x, *dout}, {*dx}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + abs, ops::AbsNPUKernel, + ops::AbsNPUKernel); + +REGISTER_OP_NPU_KERNEL( + abs_grad, ops::AbsGradNPUKernel, + ops::AbsGradNPUKernel); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 055909ba6f486ff82220c2d36c54687091bde9ed..4a12ceb13ab29f1220ae13f4990b85d396df2eca 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -182,6 +182,13 @@ $$out = e^x$$ )DOC"; +UNUSED constexpr char Expm1Doc[] = R"DOC( +Expm1 Operator. Computes expm1 of x element-wise with a natural number :math:`e` as the base. + +$$out = e^x - 1$$ + +)DOC"; + UNUSED constexpr char ReluDoc[] = R"DOC( Relu Activation Operator. @@ -706,6 +713,7 @@ REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); REGISTER_ACTIVATION_OP_MAKER(Silu, SiluDoc); REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc); REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); +REGISTER_ACTIVATION_OP_MAKER(Expm1, Expm1Doc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc); REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc); @@ -789,6 +797,27 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { } }; +template +class SigmoidDoubleGradMaker + : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("sigmoid_grad_grad"); + // input1: Out + op->SetInput("Out", this->Input("Out")); + // input2: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + op->SetAttrMap(this->Attrs()); + // output: ddy + op->SetOutput("DOutNew", this->InputGrad("Out")); + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + template class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { public: @@ -1068,6 +1097,47 @@ namespace plat = paddle::platform; FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); +/* ========================== sigmoid register ============================= + */ +// 1. 
Register Sigmoid Operator +REGISTER_OPERATOR( + sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, + ops::ActivationOpInferVarType, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::framework::OpDesc>, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::imperative::OpBase>, + std::conditional>(), + ops::ActFwdInplaceInferer, void>::type); + +// 2. Register Sigmoid Grad Operator +REGISTER_OPERATOR(sigmoid_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInferer, + ops::SigmoidDoubleGradMaker, + ops::SigmoidDoubleGradMaker) + +// 3. Register Sigmoid DoubleGrad Operator +REGISTER_OPERATOR( + sigmoid_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); + +// Register Sigmoid/GradSigmoid Kernels +REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor, + SigmoidGradFunctor); + +// Register DoubleGrad Kernel +REGISTER_OP_CPU_KERNEL( + sigmoid_grad_grad, + ops::SigmoidDoubleGradKernel>, + ops::SigmoidDoubleGradKernel>, + ops::SigmoidDoubleGradKernel>); + +/* ========================================================================== */ + /* ========================== tanh register ============================= */ REGISTER_OPERATOR( tanh, ops::ActivationOp, ops::TanhOpMaker, ops::ActivationOpInferVarType, @@ -1346,6 +1416,34 @@ REGISTER_OP_CPU_KERNEL( ops::ExpGradFunctor>); /* ========================================================================== */ +/* ========================== expm1 register ============================ */ +REGISTER_OPERATOR( + expm1, ops::ActivationOp, ops::Expm1OpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::framework::OpDesc>, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::imperative::OpBase>, + std::conditional>(), + ops::ActFwdInplaceInferer, void>::type); +REGISTER_OPERATOR(expm1_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInferer); + +REGISTER_OP_CPU_KERNEL(expm1, + ops::ActivationKernel>, + ops::ActivationKernel>, + ops::ActivationKernel>); +REGISTER_OP_CPU_KERNEL( + expm1_grad, ops::ActivationGradKernel>, + ops::ActivationGradKernel>, + ops::ActivationGradKernel>); +/* ========================================================================== */ + /* ========================== Log register ==================================*/ REGISTER_OPERATOR( log, ops::ActivationOp, ops::LogOpMaker, ops::ActivationOpInferVarType, diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 781a97c1ffcc17f40a288351fee031a18000122e..6c02450479141b2de670b09b0e0346161d5a7128 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -10,382 +10,1378 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using float16 = paddle::platform::float16; +template +struct CudaReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // relu(x) = max(x, 0) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > zero ? args[0] : zero; + } +}; + +template +struct CudaReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // dx = dout * (out > 0) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > zero ? args[0] : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; template -struct CudaVecType { - using type = T; - static constexpr int vecsize = 1; +struct CudaLeakyReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // leakyrelu(x) = x > 0 ? x : alpha * x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > zero ? args[0] : static_cast(alpha) * args[0]; + } }; -template <> -struct CudaVecType { - using type = __half2; - static constexpr int vecsize = 2; +template +struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout * (x > 0 ? 1 : alpha) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > zero ? 
args[0] : static_cast(alpha) * args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; -template <> -struct CudaVecType { - using type = float4; - static constexpr int vecsize = 4; +template +struct CudaSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // sigmoid(x) = 1 / (1 + exp(-x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(one / (one + exp(-x))); + } }; template -class BaseGPUFunctor { - public: - using ELEMENT_TYPE = T; +struct CudaSigmoidGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * out * (1 - out) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[1] * (one - args[1]); + } - using AttrPair = std::vector>; + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; - AttrPair GetAttrs() { return AttrPair(); } +template +struct CudaSiluFunctor : public BaseActivationFunctor { + // MPType means Compute Type + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // silu(x) = x / (1 + exp(-x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(x / (one + exp(-x))); + } }; -/* ========================================================================== */ +template +struct CudaSiluGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType temp = one / (one + exp(-x)); + return static_cast(dout * (temp * (one + x * (one - temp)))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; -/* =========================== relu forward ============================ */ template -class ReluGPUFunctor : public BaseGPUFunctor { - private: - T zero_; +struct CudaLogSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // logsigmoid(x) = log(1 / (1 + exp(-x))) + // For numerical stability, + // logsigmoid(x) = + // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType temp = x > zero ? zero : -x; + return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); + } +}; - public: - ReluGPUFunctor() { zero_ = static_cast(0.0f); } +template +struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // dx = dout * exp(-x) / (1 + exp(-x)) + // For numerical stability: + // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, + // 0))) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType temp1 = x > zero ? 
zero : -x; + MPType temp2 = exp(-x - temp1); + return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; - // for relu forward when T is double - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type in) { - // relu forward : out = max(x, 0) - return in > zero_ ? in : zero_; +template +struct CudaAtanFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // atan(x) = atan(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(atan(x)); } +}; - // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T in) { - // relu forward : out = max(x, 0) - return in > zero_ ? in : zero_; +template +struct CudaAtanGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (one + args[1] * args[1]); } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type in) { - // relu forward : out = max(in, 0) - return make_float4((in.x > zero_) * (in.x), (in.y > zero_) * (in.y), - (in.z > zero_) * (in.z), (in.w > zero_) * (in.w)); -} +template +struct CudaSoftShrinkFunctor : public BaseActivationFunctor { + float lambda; -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type in) { -// relu forward : out = max(in, 0) -#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) - const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(in, kzero), in); -#else - const float2 xx = __half22float2(in); - return __floats2half2_rn((xx.x > 0.0f) * static_cast(xx.x), - (xx.y > 0.0f) * static_cast(xx.y)); -#endif -} -/* ========================================================================== */ + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } -/* =========================== relu backward ============================ - */ + // softshrink(x) = x - lambda, if x > lambda; + // x + lambda, if x < -lambda; + // 0, otherwise. + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T l = static_cast(lambda); + T temp1 = static_cast(x > l); + T temp2 = static_cast(x < -l); + return temp1 * (x - l) + temp2 * (x + l); + } +}; template -class ReluGradGPUFunctor : public BaseGPUFunctor { - private: - T zero_; +struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float lambda; - public: - ReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // dx = dout, if x > lambda or x < -lambda else 0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[1]; + T l = static_cast(lambda); + return (x >= -l && x <= l) ? 
zero : args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaCeilFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // ceil(x) = ceil(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(ceil(x)); + } +}; + +template +struct CudaFloorFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // floor(x) = floor(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(floor(x)); + } +}; + +template +struct CudaRoundFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // round(x) = round(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(round(x)); + } +}; + +// grad functor for ceil, floor and round +template +struct CudaZeroGradFunctor : public BaseActivationFunctor { + __device__ __forceinline__ T operator()(const T* args) const { + return static_cast(0.0f); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } +}; + +template +struct CudaCosFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // cos(x) = cos(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(cos(x)); + } +}; + +template +struct CudaCosGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * (-sin(x)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(-dout * sin(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSinFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sin(x) = sin(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sin(x)); + } +}; + +template +struct CudaSinGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * cos(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * cos(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaTanFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tan(x) = tan(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(tan(x)); + } +}; + +template +struct CudaTanGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout / cos(x)^2 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType 
x = static_cast(args[1]); + return static_cast(dout / (cos(x) * cos(x))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAsinFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // asin(x) = asin(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(asin(x)); + } +}; + +template +struct CudaAsinGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout / sqrt(1 - x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAcosFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // acos(x) = acos(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(acos(x)); + } +}; + +template +struct CudaAcosGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = -dout / sqrt(1 - x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(-dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaCoshFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // cosh(x) = cosh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(cosh(x)); + } +}; + +template +struct CudaCoshGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * sinh(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * sinh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSinhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sinh(x) = sinh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sinh(x)); + } +}; + +template +struct CudaSinhGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * cosh(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * cosh(x)); + } - // for relu backward when T is double - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type out, - const typename CudaVecType::type dout) { - return out > zero_ ? 
dout : zero_; + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaTanhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tanh(x) = tanh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(tanh(x)); } +}; - // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) { - // relu backward : dx = out > 0 ? dout : 0 - return out > zero_ ? dout : zero_; +template +struct CudaTanhGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * (1 - out^2) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T dout = static_cast(args[0]); + T out = static_cast(args[1]); + return dout * (one - out * out); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type out, - const CudaVecType::type dout) { - // relu backward : dx = out > 0 ? dout : 0; - return make_float4((out.x > zero_) * (dout.x), (out.y > zero_) * (dout.y), - (out.z > zero_) * (dout.z), (out.w > zero_) * (dout.w)); -} - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type out, - const CudaVecType::type dout) { -// relu backward : dx = out > 0 ? dout : 0; -#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) - const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(out, kzero), dout); -#else - const float2 xx = __half22float2(out); - const float2 yy = __half22float2(dout); - return __floats2half2_rn((xx.x > 0.0f) * static_cast(yy.x), - (xx.y > 0.0f) * static_cast(yy.y)); -#endif -} +template +struct CudaReciprocalFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // reciprocal(x) = 1 / x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return one / args[0]; + } +}; -/* ========================================================================== */ -/* ======================== leaky relu forward ======================== - */ template -class LeakyReluGPUFunctor : public BaseGPUFunctor { - private: - T zero_; - float alpha_; +struct CudaReciprocalGradFunctor : public BaseActivationFunctor { + // dx = -dout * out^2 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return -args[0] * args[1] * args[1]; + } - public: - LeakyReluGPUFunctor() { zero_ = static_cast(0.0f); } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaExpFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // exp(x) = exp(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(exp(x)); + } +}; + +template +struct CudaExpGradFunctor : public BaseActivationFunctor { + // dx = dout * out + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaExpm1Functor : public 
BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // expm1(x) = expm1(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(expm1(x)); + } +}; + +template +struct CudaExpm1GradFunctor : public BaseActivationFunctor { + // dx = dout * out + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[1] + args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaLogFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // log(x) = log(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log(x)); + } +}; + +template +struct CudaLogGradFunctor : public BaseActivationFunctor { + // dx = dout / x + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSquareFunctor : public BaseActivationFunctor { + // square(x) = x * x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[0]; + } +}; + +template +struct CudaSquareGradFunctor : public BaseActivationFunctor { + T two = static_cast(2.0f); + + // dx = dout * 2 * x + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * two * args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSqrtFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sqrt(x) = sqrt(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sqrt(x)); + } +}; + +template +struct CudaSqrtGradFunctor : public BaseActivationFunctor { + T one_half = static_cast(0.5f); + + // dx = dout * 0.5 / out + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return one_half * args[0] / args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaRsqrtFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // rsqrt(x) = rsqrt(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(rsqrt(x)); + } +}; + +template +struct CudaRsqrtGradFunctor : public BaseActivationFunctor { + T minus_one_half = static_cast(-0.5f); + + // dx = dout * -0.5 / out^3 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T out = args[1]; + return minus_one_half * args[0] * out * out * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaLog1pFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // log1p(x) = log(1 + x) + // Inputs: args[0], the input x + __device__ __forceinline__ T 
operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log(one + x)); + } +}; + +template +struct CudaLog1pGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (one + args[1]); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaLog2Functor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // log2(x) = log2(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log2(x)); + } +}; + +template +struct CudaLog2GradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + T log_two = static_cast(log(static_cast(2.0f))); + + // dx = dout / (x * log(2)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (args[1] * log_two); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaLog10Functor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // log10(x) = log10(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log10(x)); + } +}; + +template +struct CudaLog10GradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + T log_ten = static_cast(log(static_cast(10.0f))); + + // dx = dout / (x * log(10)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (args[1] * log_ten); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaBReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha_}}; - } - // leakyrelu forward : out = x > 0 ? x : x * alpha - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type in) { - return in > zero_ ? in : static_cast(alpha_) * in; - } - - __device__ __forceinline__ T ComputeRemainder(const T in) { - // leakyrelu forward : out = x > 0 ? x : x * alpha - return in > zero_ ? in : static_cast(alpha_) * in; - } -}; - -template <> -__device__ __forceinline__ CudaVecType::type -LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { - // leakyrelu forward : out = x > 0 ? x : x * alpha - return make_float4((in.x > zero_) ? (in.x) : (in.x) * alpha_, - (in.y > zero_) ? (in.y) : (in.y) * alpha_, - (in.z > zero_) ? (in.z) : (in.z) * alpha_, - (in.w > zero_) ? (in.w) : (in.w) * alpha_); -} - -template <> -__device__ __forceinline__ CudaVecType::type -LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { - // leakyrelu forward : out = x > 0 ? x : x * alpha - const float2 xx = __half22float2(in); - return __floats2half2_rn((xx.x > 0.0f) ? xx.x : xx.x * alpha_, - (xx.y > 0.0f) ? 
xx.y : xx.y * alpha_); -} -/* ========================================================================== */ + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // brelu(x) = min(max(x, t_min), t_max) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + T temp_max = x > t_min_cast ? x : t_min_cast; + T temp_min = temp_max < t_max_cast ? temp_max : t_max_cast; + return temp_min; + } +}; -/* =========================== leaky relu backward ======================= - */ template -class LeakyReluGradGPUFunctor : public BaseGPUFunctor { - private: - T zero_; - float alpha_; +struct CudaBReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float t_min; + float t_max; - public: - LeakyReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + // dx = (x > t_min && x < t_max) ? dout : 0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T dout = args[0]; + T x = args[1]; + T t_min_cast = static_cast(t_min); + T t_max_cast = static_cast(t_max); + return (x > t_min_cast && x < t_max_cast) ? dout : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftReluFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // soft_relu(x) = log(1 + exp(max(min(x, threshold), -threshold))) + // Inputs: args[0], the input x + // threshold should not be negative + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType t = static_cast(threshold); + MPType temp_min = x < t ? x : t; + MPType temp_max = temp_min > -t ? temp_min : -t; + return static_cast(log(one + exp(temp_max))); + } +}; + +template +struct CudaSoftReluGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (out > -threshold && out < threshold) ? dout * (1 - exp(-out)) : 0 + // Inputs: args[0], the input dout + // args[1], the input out + // threshold should not be negative + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType out = static_cast(args[1]); + MPType t = static_cast(threshold); + return (out > -t && out < t) ? 
static_cast(dout * (one - exp(-out))) + : static_cast(0.0f); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaSTanhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + float scale_a; + float scale_b; typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha_}}; + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + // stanh(x) = b * tanh(a * x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType a = static_cast(scale_a); + MPType b = static_cast(scale_b); + return static_cast(b * tanh(a * x)); } +}; - // for leaky relu backward when T is double - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type in, - const typename CudaVecType::type dout) { - // leakyrelu backward : dx = x > 0 ? dout : alpha * dout - return in > zero_ ? dout : static_cast(alpha_) * dout; +template +struct CudaSTanhGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float scale_a; + float scale_b; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; } - // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T in, const T dout) { - // leakyrelu backward : dx = x > 0 ? dout : alpha * dout - return in > zero_ ? dout : static_cast(alpha_) * dout; + // dx = dout * a * b * (1 - tanh(a * x) * tanh(a * x)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType a = static_cast(scale_a); + MPType b = static_cast(scale_b); + MPType temp = tanh(a * x); + return static_cast(dout * a * b * (one - temp * temp)); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; -template <> -__device__ __forceinline__ CudaVecType::type -LeakyReluGradGPUFunctor::Compute(const CudaVecType::type in, - const CudaVecType::type dout) { - // leakyrelu backward : dx = x > 0 ? dout : alpha * dout - return make_float4((in.x > zero_) ? (dout.x) : alpha_ * (dout.x), - (in.y > zero_) ? (dout.y) : alpha_ * (dout.y), - (in.z > zero_) ? (dout.z) : alpha_ * (dout.z), - (in.w > zero_) ? (dout.w) : alpha_ * (dout.w)); -} - -template <> -__device__ __forceinline__ CudaVecType::type LeakyReluGradGPUFunctor< - float16>::Compute(const CudaVecType::type in, - const CudaVecType::type dout) { - // leakyrelu backward : dx = x > 0 ? dout : alpha * dout - const float2 xx = __half22float2(in); - const float2 yy = __half22float2(dout); - return __floats2half2_rn((xx.x > 0.0f) ? yy.x : alpha_ * yy.x, - (xx.y > 0.0f) ? yy.y : alpha_ * yy.y); -} +template +struct CudaSoftplusFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } + + // softplus(x) = beta * x > threshold ? 
x : log(1 + exp(beta * x)) / beta + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType b = static_cast(beta); + MPType t = static_cast(threshold); + MPType x_beta = x * beta; + return static_cast(x_beta > t ? x : log(one + exp(x_beta)) / b); + } +}; + +template +struct CudaSoftplusGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + float threshold; -/* ========================================================================== */ + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}, {"threshold", &threshold}}; + } -template -__global__ void ActivationGradKernelVec(const T* forward_data, const T* dout, - T* dx, int num, Functor functor) { - using VecType = typename CudaVecType::type; - constexpr int vecsize = CudaVecType::vecsize; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - int loop = num / vecsize; - int tail = num % vecsize; - const VecType* in_forward = reinterpret_cast(forward_data); - const VecType* in_dout = reinterpret_cast(dout); - VecType* out = reinterpret_cast(dx); - VecType forward_vec, dout_vec; - T in_data, dout_data; - for (int i = idx; i < loop; i += stride) { -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - forward_vec = __ldg(in_forward + i); - dout_vec = __ldg(in_dout + i); -#else - forward_vec = in_forward[i]; - dout_vec = in_dout[i]; -#endif - out[i] = functor.Compute(forward_vec, dout_vec); - } - - while (idx == loop && tail) { - in_data = forward_data[num - tail]; - dout_data = dout[num - tail]; - dx[num - tail] = functor.ComputeRemainder(in_data, dout_data); - --tail; - } -} - -template -__global__ void ActivationkernelVec(const T* src, T* dst, int num, - Functor functor) { - constexpr int vecsize = CudaVecType::vecsize; - using VecType = typename CudaVecType::type; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - int loop = num / vecsize; - int tail = num % vecsize; - const VecType* in = reinterpret_cast(src); - VecType* out = reinterpret_cast(dst); - VecType x_vec; - for (int i = idx; i < loop; i += stride) { -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - x_vec = __ldg(in + i); -#else - x_vec = in[i]; -#endif - out[i] = functor.Compute(x_vec); + // dx = x * beta > threshold ? dout : dout / (1 + exp(-beta * x)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType b = static_cast(beta); + MPType t = static_cast(threshold); + MPType x_beta = x * beta; + return x_beta > t ? 
args[0] : static_cast(dout / (one + exp(-x_beta))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftsignFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // softsign(x) = x / (1 + abs(x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (one + abs(args[0])); + } +}; + +template +struct CudaSoftsignGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + abs(x))^2 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T temp = one + abs(args[1]); + return args[0] / (temp * temp); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaRelu6Functor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // relu6(x) = min(max(0, x), 6) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T t = static_cast(threshold); + return args[0] <= zero ? zero : (args[0] < t ? args[0] : t); + } +}; + +template +struct CudaRelu6GradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (out > 0 && out < t) ? dout : 0 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T t = static_cast(threshold); + return (args[1] > zero && args[1] < t) ? args[0] : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaTanhShrinkFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tanhshrink(x) = x - tanh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(x - tanh(x)); + } +}; + +template +struct CudaTanhShrinkGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * tanh(x)^2 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * tanh(x) * tanh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaHardShrinkFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // hadrshrink(x) = (x > -threshold && x < threshold) ? 0 : x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : x; + } +}; + +template +struct CudaHardShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = (x > -threshold && x < threshold) ? 
0 : dout + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[1]; + T t = static_cast(threshold); + return (x > -t && x < t) ? zero : args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaHardSigmoidFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // hard_sigmoid(x) = 0, when x <= -3 + // 1, when x >= 3 + // x * slope + offset, otherwise + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T temp = args[0] * static_cast(slope) + static_cast(offset); + T temp_max = temp > zero ? temp : zero; + T temp_min = temp_max < one ? temp_max : one; + return temp_min; + } +}; + +template +struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + float slope; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + // dx = (out > 0 && out < 1) ? dout * slope : 0 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T out = args[1]; + return (out > zero && out < one) ? args[0] * static_cast(slope) : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudaSwishFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + // swish(x) = x / (1 + exp(-beta * x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType b = static_cast(beta); + return static_cast(x / (one + exp(-b * x))); + } +}; + +template +struct CudaSwishGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + float beta; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + // dx = dout * (1 + exp(-b * x) + b * x * exp(-b * x) / (1 + exp(-b * x))^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType b = static_cast(beta); + MPType temp1 = one / (one + exp(-b * x)); + MPType out = x * temp1; + MPType temp2 = b * out; + MPType temp3 = temp1 * (one - temp2); + return static_cast(dout * (temp2 + temp3)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaThresholdedReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // thresholded_relu(x) = x > threshold ? x : 0 + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > static_cast(threshold) ? 
args[0] : zero; + } +}; + +template +struct CudaThresholdedReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + // dx = x > threshold ? dout : 0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > static_cast(threshold) ? args[0] : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaHardSwishFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float threshold; + float scale; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}}; + } + + // hard_swish(x) = 0, when x <= -offset + // x , when x >= threshold - offset + // x * (x + offset) / scale, otherwise + // threshold = scale = 6, offset = 3 by default + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T t = static_cast(threshold); + T temp = x + static_cast(offset); + T temp_max = temp > zero ? temp : zero; + T temp_min = temp_max < t ? temp_max : t; + return temp_min * x / static_cast(scale); + } +}; + +template +struct CudaHardSwishGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + T one = static_cast(1.0f); + T two = static_cast(2.0f); + float threshold; + float scale; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}}; } - while (idx == loop && tail) { - dst[num - tail] = functor.ComputeRemainder(src[num - tail]); - --tail; + // dx = 0, when x <= -offset + // dout , when x >= threshold - offset + // dout * (2 * x / scale + offset / scale), otherwise + // threshold = scale = 6, offset = 3 by default + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[1]; + T o = static_cast(offset); + T s = static_cast(scale); + T temp1 = static_cast(x + o > zero); + T temp2 = static_cast(x + o < static_cast(threshold)); + return args[0] * (temp1 * temp2 * (two * x + o) / s + one - temp2); } -} + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaELUFunctor : public BaseActivationFunctor { + using CT = typename details::MPTypeTrait::Type; + CT zero = static_cast(0.0f); + CT one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // elu(x) = max(0, x) + min(0, alpha * (exp(x) - 1)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + CT x = static_cast(args[0]); + CT temp = static_cast(alpha) * (exp(x) - one); + CT res = (x > zero ? x : zero) + (temp > zero ? 
zero : temp); + return static_cast(res); + } +}; + +template +struct CudaELUGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + MPType one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout, if alpha > 0 and x > 0 + // dx = dout * alpha * x.exp(), if alpha > 0 and x <= 0 + // dx = dout * (1 + alpha * x.exp()), if alpha <= 0 and x > 0 + // dx = 0, if alpha <= 0 and x <=0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType a = static_cast(alpha); + MPType temp_a_pos = static_cast(alpha > 0.0f); + MPType temp_a_neg = static_cast(alpha <= 0.0f); + MPType temp_x_pos = static_cast(x > zero); + MPType temp_x_neg = static_cast(x <= zero); + return static_cast( + dout * (temp_a_pos * temp_x_pos + temp_a_pos * temp_x_neg * a * exp(x) + + temp_a_neg * temp_x_pos * (one + a * exp(x)))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; template -class ActivationGPUKernel +class ActivationCudaKernel : public framework::OpKernel { public: using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = nullptr; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor* x = nullptr; framework::Tensor* out = nullptr; - ExtractActivationTensor(context, &in_x, &out); - auto& dev_ctx = context.template device_context(); - - int num = in_x->numel(); - const T* input_data = in_x->data(); - T* output_data = out->mutable_data(dev_ctx.GetPlace(), - static_cast(num * sizeof(T))); - - int block = 512; -#ifdef __HIPCC__ - block = 256; -#endif - Functor functor; + ExtractActivationTensor(ctx, &x, &out); + out->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + std::vector ins = {x}; + std::vector outs = {out}; + auto functor = Functor(); auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { - *attr.second = context.Attr(attr.first); + *attr.second = ctx.Attr(attr.first); } - constexpr int vecsize = CudaVecType::vecsize; - int grid = max((num / vecsize + block - 1) / block, 1); - auto stream = context.cuda_device_context().stream(); - ActivationkernelVec<<>>( - input_data, output_data, num, functor); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } }; template -class ActivationGradGPUKernel +class ActivationGradCudaKernel : public framework::OpKernel { public: using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext& ctx) const override { const framework::Tensor *x, *out, *d_out; framework::Tensor* d_x = nullptr; x = out = d_out = nullptr; - ExtractActivationGradTensor(context, &x, &out, &d_out, + ExtractActivationGradTensor(ctx, &x, &out, &d_out, &d_x); - int numel = d_out->numel(); - auto& dev_ctx = context.template device_context(); - auto* dx_data = d_x->mutable_data( - dev_ctx.GetPlace(), static_cast(numel * sizeof(T))); - auto* dout_data = d_out->data(); + d_x->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + auto functor = Functor(); + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = ctx.Attr(attr.first); 
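      // GetAttrs() exposes the functor's attributes (e.g. "alpha", "beta",
      // "threshold") as name/pointer pairs, so this loop copies the op's
      // attribute values into the functor before the kernel is launched.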
+ } + + std::vector ins = {d_out}; + std::vector outs = {d_x}; - auto* forward_data = dout_data; if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { // Only need forward output Out - forward_data = out->data(); + ins.push_back(out); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } else if (static_cast(Functor::FwdDeps()) == static_cast(kDepX)) { // Only need forward input X - forward_data = x->data(); + ins.push_back(x); + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); + } else { + LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, functor); } - - int block = 512; -#ifdef __HIPCC__ - block = 256; -#endif - - Functor functor; - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = context.Attr(attr.first); - } - constexpr int vecsize = CudaVecType::vecsize; - int grid = max((numel / vecsize + block - 1) / block, 1); - auto stream = context.cuda_device_context().stream(); - ActivationGradKernelVec<<>>( - forward_data, dout_data, dx_data, numel, functor); } }; @@ -395,43 +1391,53 @@ class ActivationGradGPUKernel namespace ops = paddle::operators; namespace plat = paddle::platform; -#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \ - grad_functor) \ - REGISTER_OP_CUDA_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_CUDA_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>); -FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL); - -#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ +#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ - act_type, ops::ActivationGPUKernel>, \ - ops::ActivationGPUKernel>, \ - ops::ActivationGPUKernel>); \ + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); + +#define REGISTER_ACTIVATION_CUDA_KERNEL_INT(act_type, op_name, functor, \ + grad_functor) \ REGISTER_OP_CUDA_KERNEL( \ - act_type##_grad, ops::ActivationGradGPUKernel>, \ - ops::ActivationGradGPUKernel>, \ - ops::ActivationGradGPUKernel>); + act_type, ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); /* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_GPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluGPUFunctor, - LeakyReluGradGPUFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, + CudaLeakyReluGradFunctor); REGISTER_OP_CUDA_KERNEL( leaky_relu_grad_grad, @@ -444,7 +1450,7 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* ======================== elu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(elu, ELU, ELUFunctor, ELUGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(elu, ELU, CudaELUFunctor, CudaELUGradFunctor); REGISTER_OP_CUDA_KERNEL( elu_grad_grad, 
ops::ELUDoubleGradKernel>, ops::ActivationDoubleGradKernel>); +#else +REGISTER_OP_CUDA_KERNEL( + relu, ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>); +REGISTER_OP_CUDA_KERNEL( + relu_grad, ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>); +REGISTER_OP_CUDA_KERNEL( + relu_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>); +#endif +/* ========================================================================== */ + +/* =========================== sigmoid register ============================ + */ +REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, + CudaSigmoidGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + sigmoid_grad_grad, + ops::SigmoidDoubleGradKernel>, + ops::SigmoidDoubleGradKernel>, + ops::SigmoidDoubleGradKernel>); /* ========================================================================== */ /* =========================== tanh register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(tanh, Tanh, CudaTanhFunctor, + CudaTanhGradFunctor); REGISTER_OP_CUDA_KERNEL( tanh_grad_grad, @@ -482,7 +1535,8 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== sqrt register ============================= */ -REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, CudaSqrtFunctor, + CudaSqrtGradFunctor); REGISTER_OP_CUDA_KERNEL( sqrt_grad_grad, @@ -496,7 +1550,8 @@ REGISTER_OP_CUDA_KERNEL( /* =========================== rsqrt register ============================= */ -REGISTER_ACTIVATION_CUDA_KERNEL(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(rsqrt, Rsqrt, CudaRsqrtFunctor, + CudaRsqrtGradFunctor); REGISTER_OP_CUDA_KERNEL( rsqrt_grad_grad, @@ -509,25 +1564,8 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== square register ============================ */ -REGISTER_OP_CUDA_KERNEL( - square, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>); -REGISTER_OP_CUDA_KERNEL( - square_grad, ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>); +REGISTER_ACTIVATION_CUDA_KERNEL_INT(square, Square, CudaSquareFunctor, + CudaSquareGradFunctor); REGISTER_OP_CUDA_KERNEL( square_grad_grad, @@ -544,7 +1582,6 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* ========================== pow register ============================ */ - REGISTER_OP_CUDA_KERNEL( pow, ops::PowKernel>, ops::PowKernel>, @@ -562,29 +1599,48 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* ========================== exp register ============================ */ - REGISTER_OP_CUDA_KERNEL( - exp, ops::ActivationKernel>, - ops::ActivationKernel>, + exp, ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, ops::ActivationKernel>, ops::ActivationKernel>, - ops::ActivationKernel>); + ops::ActivationCudaKernel>); 
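// Note on the exp registration above: the floating-point instantiations are
// switched to the new ActivationCudaKernel/CudaExpFunctor path, while the two
// unchanged ActivationKernel lines (evidently the int and int64_t
// instantiations) stay on the legacy Eigen-based ExpFunctor path. That mix is
// presumably why exp is registered by hand here rather than through
// REGISTER_ACTIVATION_CUDA_KERNEL_INT.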
REGISTER_OP_CUDA_KERNEL( - exp_grad, ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>); + exp_grad, ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>); +/* ========================================================================== */ + +/* ========================== expm1 register ============================ */ + +REGISTER_OP_CUDA_KERNEL( + expm1, ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>); +REGISTER_OP_CUDA_KERNEL( + expm1_grad, ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>); /* ========================================================================== */ /* ========================== Log register ==================================*/ -REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, LogFunctor, LogGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); REGISTER_OP_CUDA_KERNEL( log_grad_grad, ops::LogDoubleGradKernel>); /* ========================================================================== */ + +#define FOR_EACH_ACTIVATION_CUDA_OP(__macro) \ + __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor); \ + __macro(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, \ + CudaLogSigmoidGradFunctor); \ + __macro(atan, Atan, CudaAtanFunctor, CudaAtanGradFunctor); \ + __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor, \ + CudaSoftShrinkGradFunctor); \ + __macro(ceil, Ceil, CudaCeilFunctor, CudaZeroGradFunctor); \ + __macro(floor, Floor, CudaFloorFunctor, CudaZeroGradFunctor); \ + __macro(cos, Cos, CudaCosFunctor, CudaCosGradFunctor); \ + __macro(tan, Tan, CudaTanFunctor, CudaTanGradFunctor); \ + __macro(acos, Acos, CudaAcosFunctor, CudaAcosGradFunctor); \ + __macro(sin, Sin, CudaSinFunctor, CudaSinGradFunctor); \ + __macro(asin, Asin, CudaAsinFunctor, CudaAsinGradFunctor); \ + __macro(sinh, Sinh, CudaSinhFunctor, CudaSinhGradFunctor); \ + __macro(cosh, Cosh, CudaCoshFunctor, CudaCoshGradFunctor); \ + __macro(round, Round, CudaRoundFunctor, CudaZeroGradFunctor); \ + __macro(reciprocal, Reciprocal, CudaReciprocalFunctor, \ + CudaReciprocalGradFunctor); \ + __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor); \ + __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor); \ + __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor); \ + __macro(brelu, BRelu, CudaBReluFunctor, CudaBReluGradFunctor); \ + __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \ + __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor); \ + __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor); \ + __macro(softsign, Softsign, CudaSoftsignFunctor, CudaSoftsignGradFunctor); \ + __macro(relu6, Relu6, CudaRelu6Functor, CudaRelu6GradFunctor); \ + __macro(tanh_shrink, TanhShrink, CudaTanhShrinkFunctor, \ + CudaTanhShrinkGradFunctor); \ + __macro(hard_shrink, HardShrink, CudaHardShrinkFunctor, \ + CudaHardShrinkGradFunctor); \ + __macro(hard_sigmoid, HardSigmoid, CudaHardSigmoidFunctor, \ + CudaHardSigmoidGradFunctor); \ + __macro(swish, Swish, CudaSwishFunctor, CudaSwishGradFunctor); \ + __macro(thresholded_relu, ThresholdedRelu, CudaThresholdedReluFunctor, \ + CudaThresholdedReluGradFunctor); \ + __macro(hard_swish, HardSwish, CudaHardSwishFunctor, \ + CudaHardSwishGradFunctor); 
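// FOR_EACH_ACTIVATION_CUDA_OP is an X-macro: the single invocation below
// expands REGISTER_ACTIVATION_CUDA_KERNEL once per (op, OpName, functor,
// grad functor) entry, so every op listed above gets forward and _grad CUDA
// kernels for the three floating-point types (float, double, plat::float16).
// For example, the entry
//   __macro(silu, Silu, CudaSiluFunctor, CudaSiluGradFunctor);
// becomes REGISTER_OP_CUDA_KERNEL(silu, ...) plus
// REGISTER_OP_CUDA_KERNEL(silu_grad, ...). Ops that need extra registrations,
// such as integer or double-grad kernels (e.g. relu, sigmoid, tanh, sqrt,
// square, exp, log, leaky_relu, elu), are registered individually above.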
+FOR_EACH_ACTIVATION_CUDA_OP(REGISTER_ACTIVATION_CUDA_KERNEL) diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 7245dea9cf9499ef310e4e601f41ab3e1e374158..57ea97f746246bf9fcbd434d9a45ac1a1c73d251 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -258,6 +258,43 @@ struct SigmoidGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +/* + Out + DOut -> SigmoidGradGrad -> DOutNew + DDX DDOut + + DDOut = (1-Out)*Out*DDX + DOutNew = (1-2*Out)*DOut*DDX +*/ +template +struct SigmoidGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, const framework::Tensor* dOut, + framework::Tensor* dOutNew, framework::Tensor* ddOut) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidGradGrad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidGradGrad")); + + if (dOutNew) { + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); + auto dout_new = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SquareGradGrad")); + dout_new.device(*d) = + (static_cast(1) - static_cast(2) * out) * dout * ddx; + } + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad")); + ddout.device(*d) = (static_cast(1) - out) * out * ddx; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + // silu(x) = x / (1 + exp(-x)) template struct SiluFunctor : public BaseActivationFunctor { @@ -341,6 +378,26 @@ struct ExpGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +// expm1(x) = e^x - 1 +template +struct Expm1Functor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.expm1(); + } +}; + +template +struct Expm1GradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out + dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + // relu(x) = max(x, 0) template struct ReluCPUFunctor : public BaseActivationFunctor { @@ -455,7 +512,7 @@ struct HardShrinkFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - out.device(d) = x * (temp1 + temp2).template cast(); + out.device(d) = x * (temp1 || temp2).template cast(); } }; @@ -472,7 +529,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - dx.device(d) = dout * (temp1 + temp2).template cast(); + dx.device(d) = dout * (temp1 || temp2).template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -1789,6 +1846,50 @@ inline void ExtractDoubleGradTensorWithInputDOut( } } +template +class SigmoidDoubleGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *Out, *ddX, 
*dOut; + framework::Tensor *dOutNew, *ddOut; + Out = ddX = dOut = nullptr; + dOutNew = ddOut = nullptr; + + // extract ddx(input) and out(input) + ddX = ctx.Input("DDX"); + Out = ctx.Input("Out"); + PADDLE_ENFORCE_NOT_NULL( + ddX, platform::errors::NotFound( + "Cannot get input Variable ddX, variable name = %s", + ctx.InputName("DDX"))); + PADDLE_ENFORCE_NOT_NULL( + Out, platform::errors::NotFound( + "Cannot get input Variable Out, variable name = %s", + ctx.InputName("Out"))); + + // set output ddout + ddOut = ctx.Output("DDOut"); + + // extract dOut(intput) + dOut = ctx.Input("DOut"); + PADDLE_ENFORCE_NOT_NULL( + dOut, platform::errors::NotFound( + "Cannot get input Variable dOut, variable name = %s", + ctx.InputName("DOut"))); + + // set output dout_new + dOutNew = ctx.Output("DOutNew"); + + if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); + if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); + auto& place = ctx.template device_context(); + Functor functor; + functor(place, Out, ddX, dOut, dOutNew, ddOut); + } +}; + template class TanhDoubleGradKernel : public framework::OpKernel { @@ -2153,7 +2254,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor { } // namespace paddle #define FOR_EACH_ACTIVATION_OP(__macro) \ - __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index f368c658230555c5a3529b39dfc1b60b1cab56e4..cb3d85c1368bc4ffacf20aa24fa2722b56925186 100644 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -35,10 +35,10 @@ class PowNPUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("Power", {*x}, {*out}, - {{"power", factor}, - {"scale", static_cast(1.0)}, - {"shift", static_cast(0.0)}}); + const auto& runner = NpuOpRunner("Power", {*x}, {*out}, + {{"power", factor}, + {"scale", static_cast(1.0)}, + {"shift", static_cast(0.0)}}); auto stream = ctx.template device_context() @@ -68,8 +68,8 @@ class PowGradNPUKernel : public framework::OpKernel { // Step1: Compute x_pow = x.pow(factor-1) Tensor x_pow(x->type()); x_pow.mutable_data(x->dims(), place); - auto runner_pow = NpuOpRunner("Power", {*x}, {x_pow}, - {{"power", factor - static_cast(1)}}); + const auto& runner_pow = NpuOpRunner( + "Power", {*x}, {x_pow}, {{"power", factor - static_cast(1)}}); runner_pow.Run(stream); // Step 2: Construct a broadcast factor, which has the same shape with x. @@ -83,20 +83,21 @@ class PowGradNPUKernel : public framework::OpKernel { // factor. 
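// [Editor's note: illustrative sketch, not part of this patch.]
// The SigmoidGradGradFunctor added above encodes the second-order sigmoid
// derivatives (DDOut = (1-Out)*Out*DDX, DOutNew = (1-2*Out)*DOut*DDX). A
// standalone scalar check, with all names below made up for the example,
// compares the (1 - 2*out) factor against a finite difference of the
// first-order gradient g(x) = sigmoid(x) * (1 - sigmoid(x)):
#include <cmath>
#include <cstdio>

static double sigmoid(double x) { return 1.0 / (1.0 + std::exp(-x)); }
static double sigmoid_grad(double x) {
  const double o = sigmoid(x);
  return o * (1.0 - o);
}

int main() {
  const double x = 0.3, dout = 0.7, ddx = 1.1, eps = 1e-6;
  const double out = sigmoid(x);
  const double ddout = (1.0 - out) * out * ddx;            // DDOut
  const double dout_new = (1.0 - 2.0 * out) * dout * ddx;  // DOutNew
  // d/dx of the first-order grad should equal (1 - 2*out) * out * (1 - out).
  const double fd = (sigmoid_grad(x + eps) - sigmoid_grad(x - eps)) / (2 * eps);
  const double an = (1.0 - 2.0 * out) * out * (1.0 - out);
  std::printf("DDOut=%.6f DOutNew=%.6f  g'(x): fd=%.6f analytic=%.6f\n",
              ddout, dout_new, fd, an);
  return 0;
}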
Tensor factor_bc_tensor(framework::proto::VarType::FP32); factor_bc_tensor.mutable_data(x_dims, place); - auto runner_bc = NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, - {{"dims", framework::vectorize(x_dims)}}); + const auto& runner_bc = + NpuOpRunner("FillD", {factor_tensor}, {factor_bc_tensor}, + {{"dims", framework::vectorize(x_dims)}}); runner_bc.Run(stream); // Step 3: Compute x_power_mul_factor = factor * x.pow(factor-1) Tensor x_power_mul_factor(x->type()); x_power_mul_factor.mutable_data(x->dims(), place); - auto runner_mul_1 = + const auto& runner_mul_1 = NpuOpRunner("Mul", {factor_bc_tensor, x_pow}, {x_power_mul_factor}, {}); runner_mul_1.Run(stream); // Step 4: Compute dx = dout * factor * x.pow(factor-1) dx->mutable_data(place); - auto runner_mul_2 = + const auto& runner_mul_2 = NpuOpRunner("Mul", {*dout, x_power_mul_factor}, {*dx}, {}); runner_mul_2.Run(stream); } @@ -111,11 +112,11 @@ class ReluNPUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("Relu", - { - *x, - }, - {*out}, {}); + const auto& runner = NpuOpRunner("Relu", + { + *x, + }, + {*out}, {}); auto stream = ctx.template device_context() @@ -137,7 +138,7 @@ class ReluGradNPUKernel : public framework::OpKernel { .stream(); dx->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); + const auto& runner = NpuOpRunner("ReluGrad", {*dout, *out}, {*dx}, {}); runner.Run(stream); } @@ -159,7 +160,7 @@ class SqrtNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); + const auto& runner = NpuOpRunner("Sqrt", {*x}, {*out}, {}); runner.Run(stream); } }; @@ -181,8 +182,8 @@ class SqrtGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto dx_runner = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); - dx_runner.Run(stream); + const auto& runner_dx = NpuOpRunner("SqrtGrad", {*out, *dout}, {*dx}, {}); + runner_dx.Run(stream); } }; @@ -204,16 +205,16 @@ class LogNPUKernel : public framework::OpKernel { Tensor one(x->type()); one.mutable_data(x->dims(), place); - auto one_runner = NpuOpRunner("OnesLike", {*x}, {one}, {}); - one_runner.Run(stream); + const auto& runner_one = NpuOpRunner("OnesLike", {*x}, {one}, {}); + runner_one.Run(stream); Tensor sub(x->type()); sub.mutable_data(x->dims(), place); - auto sub_runner = NpuOpRunner("Sub", {*x, one}, {sub}, {}); - sub_runner.Run(stream); + const auto& runner_sub = NpuOpRunner("Sub", {*x, one}, {sub}, {}); + runner_sub.Run(stream); - auto out_runner = NpuOpRunner("Log1p", {sub}, {*out}, {}); - out_runner.Run(stream); + const auto& runner_out = NpuOpRunner("Log1p", {sub}, {*out}, {}); + runner_out.Run(stream); } }; @@ -233,7 +234,7 @@ class LogGradNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context() .stream(); - auto runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); + const auto& runner = NpuOpRunner("DivNoNan", {*dout, *x}, {*dx}, {}); runner.Run(stream); } }; @@ -254,7 +255,7 @@ class TanhNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); + const auto& runner = NpuOpRunner("Tanh", {*x}, {*out}, {}); runner.Run(stream); } }; @@ -276,8 +277,8 @@ class TanhGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto dx_runner = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); - dx_runner.Run(stream); + 
const auto& runner_dx = NpuOpRunner("TanhGrad", {*out, *dout}, {*dx}, {}); + runner_dx.Run(stream); } }; @@ -297,7 +298,7 @@ class SquareNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Square", {*x}, {*out}, {}); + const auto& runner = NpuOpRunner("Square", {*x}, {*out}, {}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc index fe5b08af52a624b29100635ee34cfac7c2d2a859..82436bdef16bcf59baeac2054f3cce3fd9a54047 100644 --- a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -29,7 +29,8 @@ class AllocFloatStatusKernel : public framework::OpKernel { auto* float_status = ctx.Output("FloatStatus"); float_status->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); + const auto& runner = + NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 2c3a9c366e4fd010249248082f618a4893013da3..c699486a9140a388dc79359cf3cc40fc61e4f45b 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -39,33 +39,36 @@ __global__ void CheckFiniteAndUnscale(const T** xs, const MT* scale, __syncthreads(); const int64_t num = s_starts[size]; - int pre_xs_index = 0; - bool t_found_inf = false; - const MT t_scale = *scale; + int xs_index = 0; + bool local_found_inf = false; + const MT local_scale = *scale; for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { - // get the xs's index of thread - int xs_index = pre_xs_index; - while (idx < s_starts[xs_index]) xs_index++; - // avoid some tensor's numel is zero - while (idx >= s_starts[xs_index]) xs_index++; - pre_xs_index = xs_index - 1; + // get the "out" index of "id" + // For example: + // idx = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= idx < 20 ==> + // the idx element locate in the 3rd tensor (notice the 2nd tensor size is + // 0) + int next_xs_index = xs_index; + while (idx >= s_starts[next_xs_index]) next_xs_index++; + xs_index = next_xs_index - 1; // get in data and out data - const T* in = xs[pre_xs_index]; - T* out = outs[pre_xs_index]; - int64_t in_idx = idx - s_starts[pre_xs_index]; + const T* in = xs[xs_index]; + T* out = outs[xs_index]; + int64_t in_idx = idx - s_starts[xs_index]; // Unscale - MT val = static_cast(in[in_idx]) * t_scale; + MT val = static_cast(in[in_idx]) * local_scale; T narrow_val = static_cast(val); out[in_idx] = narrow_val; // CheckFinite if (!isfinite(narrow_val)) { - t_found_inf = true; + local_found_inf = true; } } - if (t_found_inf) { + if (local_found_inf) { *found_inf = true; } } @@ -94,28 +97,30 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { scale_data, inverse_scale_v, found_inf_data); size_t xs_size = xs.size(); + const auto& cpu_place = platform::CPUPlace(); // calculate each tensor's start index and copy to device auto h_starts_tensor = - memory::Alloc(platform::CPUPlace(), (xs_size + 1) * sizeof(int64_t)); + memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); auto d_starts_tensor = memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); int64_t* d_starts = 
reinterpret_cast(d_starts_tensor->ptr()); + // the start index value of each tensor is + // the sum of previous tensor's size. For example: + // xs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] h_starts[0] = 0; for (int i = 1; i <= xs_size; i++) { - // the start index value of each tensor is - // the sum of previous tensor's size h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); } int64_t total_num = h_starts[xs_size]; memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - d_starts, platform::CPUPlace(), h_starts, - (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()); + d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); // copy each tensor's data address to device - auto h_mem = memory::Alloc(platform::CPUPlace(), 2 * xs_size * sizeof(T*)); + auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); const T** h_xs = reinterpret_cast(h_mem->ptr()); T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; @@ -128,16 +133,18 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs, - platform::CPUPlace(), h_xs, 2 * xs_size * sizeof(T*), - dev_ctx.stream()); + cpu_place, h_xs, 2 * xs_size * sizeof(T*), dev_ctx.stream()); // Launch Kernel - int block = 1024; - int block_num = block * 20; // each thread deal with 20 number - int grid = (total_num + block_num - 1) / block_num; + int threads_per_block = std::min(static_cast(1024), total_num); + int elements_per_block = + threads_per_block * 20; // each thread deal with 20 number + int blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale<<< - grid, block, (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()>>>( + CheckFiniteAndUnscale< + T, MPDType><<>>( d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); VLOG(3) << "finish kernel"; } diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc index 8fd45326e4ec6134cf4b98be12212ce8d7d74541..26280cd2bd1d32fedaa01d0b638fdcc89749bb76 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -42,13 +42,11 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { found_inf->mutable_data(ctx.GetPlace()); - bool found_inf_data = false; - auto stream = ctx.template device_context() .stream(); - // step1: inverse scale(RealDiv) + // step1: inverse scale Tensor const_tensor; const_tensor.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); @@ -58,7 +56,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { Tensor inverse_out(scale->type()); inverse_out.Resize(scale->dims()); inverse_out.mutable_data(ctx.GetPlace()); - auto runner_inverse = + const auto& runner_inverse = NpuOpRunner("Div", {const_tensor, *scale}, {inverse_out}, {}); runner_inverse.Run(stream); tmp_inverse_out = &inverse_out; @@ -66,55 +64,41 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { // NOTE(zhiqiu): Tensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); - // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. // tmp is only placeholder. 
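// [Editor's note: illustrative sketch, not part of this patch.]
// Host-side restatement of the "starts" lookup used by the fused CUDA kernel
// above: starts[i] is the running sum of the numels of tensors 0..i-1, so a
// flat index idx belongs to the tensor whose half-open range
// [starts[k], starts[k+1]) contains it; empty tensors are skipped naturally.
// All names and values here are illustrative.
#include <cstdio>
#include <vector>

int main() {
  const std::vector<long long> numels = {10, 0, 10, 10};
  std::vector<long long> starts(numels.size() + 1, 0);
  for (size_t i = 0; i < numels.size(); ++i)
    starts[i + 1] = starts[i] + numels[i];  // {0, 10, 10, 20, 30}

  const long long idx = 15;
  size_t k = 0;
  while (idx >= starts[k + 1]) ++k;  // idx = 15 lands in tensor 2 (0-based)
  const long long offset = idx - starts[k];
  std::printf("flat index %lld -> tensor %zu, element %lld\n", idx, k, offset);
  return 0;
}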
- auto runner_float_status = + const auto& runner_float_status = NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}, {{"message", std::string("check_nan_and_inf")}}); runner_float_status.Run(stream); Tensor sum; sum.mutable_data({1}, ctx.GetPlace()); - auto runner_reduce_sum = + const auto& runner_reduce_sum = NpuOpRunner("ReduceSumD", {*float_status}, {sum}, {{"axes", std::vector{0}}, {"keep_dims", true}}); runner_reduce_sum.Run(stream); - std::vector sum_vec; - TensorToVector( - sum, ctx.template device_context(), - &sum_vec); - found_inf_data = (sum_vec[0] > 1); - - VLOG(4) << "found_inf_data:" << found_inf_data; - + const auto& runner_greater = + NpuOpRunner("GreaterEqual", {sum, const_tensor}, {*found_inf}, {}); + runner_greater.Run(stream); + + // NOTE(zhiqiu): The normal logic is : + // out = in, if found_inf = true + // out = in/scale, if found_inf = false + // However, on NPU, in order to avoid stream sync, we do not copy the + // found_inf data to cpu to check whether to unscale or not. + // Instead, we do the Mul no matter found_inf or not. + // And, a fact is, only few steps contains nan/inf during training. for (size_t i = 0; i < xs.size(); ++i) { const auto* x = xs[i]; auto* out = outs[i]; out->mutable_data(ctx.GetPlace()); - if (!found_inf_data) { - // MatMul - auto runner_matmul = - NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); - runner_matmul.Run(stream); - } + const auto& runner_mul = + NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); + runner_mul.Run(stream); } - // set found_inf to true - VLOG(4) << "found overflow:" << found_inf_data; - Tensor found_inf_tensor; - found_inf_tensor.Resize({1}); - bool* is_found_inf = - found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); - *is_found_inf = found_inf_data; - - framework::TensorCopy( - found_inf_tensor, ctx.GetPlace(), - ctx.template device_context(), found_inf); - ctx.template device_context().Wait(); - - auto runner_clear_status = + const auto& runner_clear_status = NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); runner_clear_status.Run(stream); } diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..210f3e098f95f490f9c5d4adf53d9ee4f20f3e97 --- /dev/null +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_xpu.cc @@ -0,0 +1,170 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/amp/check_finite_and_unscale_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/platform/float16.h" +namespace paddle { +namespace operators { +template +class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + using XPUTyp = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto& dev_ctx = ctx.template device_context(); + const auto xs = ctx.MultiInput("X"); + const auto* scale = ctx.Input("Scale"); + auto outs = ctx.MultiOutput("Out"); + auto* found_inf = ctx.Output("FoundInfinite"); + + const MPDType* scale_data = scale->data(); + bool* found_inf_data = found_inf->mutable_data(dev_ctx.GetPlace()); + + // cpy to cpu + bool cpu_found_inf_data = false; + + MPDType cpu_scale_data; + if (platform::is_xpu_place(scale->place())) { + xpu_memcpy(&cpu_scale_data, scale_data, sizeof(MPDType), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + } else { + cpu_scale_data = (*scale_data); + } + MPDType inverse_scale = 1.0 / cpu_scale_data; + for (size_t i = 0; i < xs.size(); ++i) { + const auto* x = xs[i]; + auto* out = outs[i]; + out->mutable_data(dev_ctx.GetPlace()); + framework::Tensor is_finite = + ctx.AllocateTmpTensor(x->dims(), + dev_ctx); + framework::Tensor is_nan = + ctx.AllocateTmpTensor(x->dims(), + dev_ctx); + framework::Tensor is_finite_and_nan = + ctx.AllocateTmpTensor(x->dims(), + dev_ctx); + if (cpu_found_inf_data == false) { + int r = xpu::isfinite(dev_ctx.x_context(), + reinterpret_cast(x->data()), + is_finite.data(), x->numel()); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(isfinite) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::logical_not(dev_ctx.x_context(), reinterpret_cast( + is_finite.data()), + is_finite.data(), x->numel()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(logical_not) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::isnan(dev_ctx.x_context(), + reinterpret_cast(x->data()), + is_nan.data(), x->numel()); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(isnan) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::logical_or(dev_ctx.x_context(), is_finite.data(), + is_nan.data(), is_finite.data(), + x->numel()); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU API(logical_or) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::any(dev_ctx.x_context(), is_finite.data(), + found_inf_data, x->numel()); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(any) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + memory::Copy(platform::CPUPlace(), &cpu_found_inf_data, + BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), + found_inf_data, sizeof(bool)); + } + + if (cpu_found_inf_data) { + inverse_scale = 0.0; + } + auto dev_env = XPUEnv::getenv("XPUSIM_DEVICE_MODEL"); + + if (std::is_same::value && + (dev_env == nullptr || std::strcmp(dev_env, "KUNLUN1"))) { + framework::Tensor float_x; + framework::Tensor float_out; + float_x.mutable_data(dev_ctx.GetPlace(), + x->numel() * sizeof(MPDType)); + float_out.mutable_data(dev_ctx.GetPlace(), + out->numel() * sizeof(MPDType)); + int r = xpu::cast_v2(dev_ctx.x_context(), + reinterpret_cast(x->data()), + float_x.data(), x->numel()); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + 
"XPU API(cast_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + + r = xpu::scale(dev_ctx.x_context(), float_x.data(), + float_out.data(), x->numel(), false, + inverse_scale, 0.0); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(scale) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + + r = xpu::cast_v2(dev_ctx.x_context(), float_out.data(), + reinterpret_cast(out->data()), + out->numel()); + + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(cast_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + if (dev_ctx.x_context()->xpu_stream) { + dev_ctx.Wait(); + } + + } else { + int r = xpu::scale(dev_ctx.x_context(), + reinterpret_cast(x->data()), + reinterpret_cast(out->data()), + x->numel(), false, inverse_scale, 0.0); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(scale) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } + } + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), + found_inf_data, platform::CPUPlace(), &cpu_found_inf_data, + sizeof(bool)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_XPU_KERNEL(check_finite_and_unscale, + ops::CheckFiniteAndUnscaleXPUKernel, + ops::CheckFiniteAndUnscaleXPUKernel); + +#endif diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index b48b0e78892933bc76894611d0ae6d01c194d036..de1f83c1ee50d00960c50638fab5fd6cffca1a36 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -34,13 +34,39 @@ __global__ void GpuUpdateLossScaling( } template -__global__ void FillIf(T* data, const int64_t num, const T value, - const bool* has_inf) { - if (*has_inf) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < num; i += blockDim.x * gridDim.x) { - data[i] = value; - } +__global__ void FusedFillIf(T** outs, const size_t xs_size, + const int64_t* starts, const T value, + const bool* has_inf) { + if (!(*has_inf)) return; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= xs_size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t total_num = s_starts[xs_size]; + int out_index = 0; + + for (int64_t id = tid; id < total_num; id += blockDim.x * gridDim.x) { + // get the "out" index of "id" + // For example: + // id = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= id < 20 ==> + // the id element locate in the 3rd tensor (notice the 2nd tensor size is 0) + int next_out_index = out_index; + while (id >= s_starts[next_out_index]) next_out_index++; + out_index = next_out_index - 1; + + // get data pointer and index + T* out_data = outs[out_index]; + int64_t idx = id - s_starts[out_index]; + + // set value + out_data[idx] = value; } } @@ -68,15 +94,52 @@ class LazyZeros { const bool* found_inf_data, const std::vector& xs, const std::vector& outs) const { - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - int64_t num = out->numel(); - int block = 1024; - int grid = (block - 1 + num) / block; - FillIf<<>>( - out_data, num, static_cast(0), found_inf_data); + size_t xs_size = xs.size(); + const auto& cpu_place = 
platform::CPUPlace(); + // alloc each tensor's start index and copy to device + auto h_in_starts_mem = + memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_in_starts_mem->ptr()); + + auto d_in_starts_mem = + memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_in_starts_mem->ptr()); + + // the start index value of each tensor is + // the sum of previous tensor's size. For example: + // outs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] + h_starts[0] = 0; + for (int i = 0; i < xs_size; i++) { + h_starts[i + 1] = h_starts[i] + outs[i]->numel(); } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); + + // copy each tensor of "outs" data address array to device + auto h_out_addrs_mem = memory::Alloc(cpu_place, xs_size * sizeof(T*)); + T** h_out_addrs = reinterpret_cast(h_out_addrs_mem->ptr()); + + auto d_out_addrs_mem = memory::Alloc(dev_ctx, xs_size * sizeof(T*)); + T** d_out_addrs = reinterpret_cast(d_out_addrs_mem->ptr()); + + for (size_t i = 0; i < xs_size; ++i) { + h_out_addrs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); + } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_out_addrs, cpu_place, h_out_addrs, xs_size * sizeof(T*), + dev_ctx.stream()); + + // launch cuda kernel + int64_t total_num = h_starts[xs_size]; + int64_t threads_per_block = std::min(static_cast(1024), total_num); + int64_t elements_per_block = + threads_per_block * 50; // each thread deal with 50 data + int64_t blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; + FusedFillIf<<>>( + d_out_addrs, xs_size, d_starts, static_cast(0), found_inf_data); } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index 45b28bf61e5d683a68ec0af58ebea0f4c6cc4871..6db18c46a09b85e08ffecc14ce86f8f20bb7713e 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -15,6 +15,7 @@ limitations under the License. 
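// [Editor's note: illustrative sketch, not part of this patch.]
// The launch-size arithmetic used by the fused kernels above, restated on the
// host: each thread walks the data with a grid-stride loop and is budgeted a
// fixed number of elements (20 in CheckFiniteAndUnscale, 50 in FusedFillIf),
// so the grid is sized as ceil(total_num / (threads_per_block * budget)).
// The concrete numbers below are made up for the example.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t total_num = 123456;      // illustrative element count
  const int64_t per_thread_budget = 20;  // elements handled per thread
  const int64_t threads_per_block =
      std::min(static_cast<int64_t>(1024), total_num);
  const int64_t elements_per_block = threads_per_block * per_thread_budget;
  const int64_t blocks_per_grid =
      (total_num + elements_per_block - 1) / elements_per_block;  // ceil div
  std::printf("%lld block(s) x %lld thread(s)\n",
              static_cast<long long>(blocks_per_grid),
              static_cast<long long>(threads_per_block));
  return 0;
}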
*/ #include "paddle/fluid/operators/amp/update_loss_scaling_op.h" #include #include +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/npu_op_runner.h" @@ -42,18 +43,18 @@ void Update(const platform::NPUDeviceContext& ctx, Tensor factor_tensor(bad_out_tensor->type()); factor_tensor.mutable_data({1}, place); FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); - auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, - {*bad_out_tensor}, {}); + const auto& runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, + {*bad_out_tensor}, {}); runner_p2.Run(stream); std::vector bad_out_data; TensorToVector(*bad_out_tensor, ctx, &bad_out_data); if (bad_out_data[0] == decr_every_n_nan_or_inf) { - auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", decr_ratio}, - {"shift", static_cast(0)}}); + const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", decr_ratio}, + {"shift", static_cast(0)}}); runner_p3.Run(stream); @@ -61,11 +62,11 @@ void Update(const platform::NPUDeviceContext& ctx, TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); if (new_loss_scaling[0] < static_cast(1)) { // updated_loss_scaling_data = 1 - auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", static_cast(0)}, - {"shift", static_cast(1)}}); + const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(0)}, + {"shift", static_cast(1)}}); runner_p4.Run(stream); } @@ -85,30 +86,30 @@ void Update(const platform::NPUDeviceContext& ctx, Tensor factor_tensor(good_out_tensor->type()); factor_tensor.mutable_data({1}, place); FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); - auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, - {*good_out_tensor}, {}); + const auto& runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, + {*good_out_tensor}, {}); runner_p2.Run(stream); std::vector good_out_data; TensorToVector(*good_out_tensor, ctx, &good_out_data); if (good_out_data[0] == incr_every_n_steps) { - auto runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", incr_ratio}, - {"shift", static_cast(0)}}); + const auto& runner_p3 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", incr_ratio}, + {"shift", static_cast(0)}}); runner_p3.Run(stream); std::vector new_loss_scaling; TensorToVector(*updated_loss_scaling_tensor, ctx, &new_loss_scaling); if (!std::isfinite(new_loss_scaling[0])) { // updated_loss_scaling_data = pre_loss_scaling_data - auto runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, - {*updated_loss_scaling_tensor}, - {{"power", static_cast(1)}, - {"scale", static_cast(1)}, - {"shift", static_cast(0)}}); + const auto& runner_p4 = NpuOpRunner("Power", {*pre_loss_scaling_tensor}, + {*updated_loss_scaling_tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(1)}, + {"shift", static_cast(0)}}); runner_p4.Run(stream); } @@ -145,16 +146,43 @@ class LazyZerosNPU { const std::vector found_inf_vec, const std::vector& xs, const std::vector& outs) 
const { + if (!xs.size()) { + return; + } + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + Tensor* zero_tensor; + void* zero_ptr; + if (found_inf_vec[0]) { + int max_num = -1; + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + int num = out->numel(); + if (max_num < num) { + max_num = num; + zero_tensor = out; + } + } + + zero_tensor->mutable_data(place); + const auto& runner_zeros = + NpuOpRunner("ZerosLike", {*zero_tensor}, {*zero_tensor}); + runner_zeros.Run(stream); + zero_tensor->check_memory_size(); + zero_ptr = zero_tensor->data(); + } + for (size_t i = 0; i < xs.size(); ++i) { auto* out = outs[i]; - if (found_inf_vec[0]) { - VLOG(4) << "-- UpdateLossScaling: Find infinite grads. --"; - - auto place = dev_ctx.GetPlace(); - auto stream = dev_ctx.stream(); - auto g = out->mutable_data(place); - platform::NPUMemsetAsync(static_cast(g), 0, - out->numel() * sizeof(T), stream); + auto* x = xs[i]; + auto dst_ptr = out->mutable_data(place); + if (!found_inf_vec[0]) { + framework::TensorCopy(*x, place, dev_ctx, out); + } else if (zero_ptr != dst_ptr) { + auto size = out->numel() * framework::SizeOfType(out->type()); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, place), zero_ptr, size, + stream); } } } diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..1f05e5f246d9c564dbf53b121b07ff4beb84c686 --- /dev/null +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_xpu.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/amp/update_loss_scaling_op.h" +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +class UpdateLossScalingXPUKernel : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + using XPUTyp = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const auto xs = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); + const auto* found_inf = ctx.Input("FoundInfinite"); + PADDLE_ENFORCE_EQ(found_inf->numel(), 1, + platform::errors::InvalidArgument( + "FoundInfinite must has only one element.")); + const bool* found_inf_data = found_inf->data(); + bool cpu_found_inf_data = false; + if (platform::is_xpu_place(found_inf->place())) { + xpu_memcpy(&cpu_found_inf_data, found_inf_data, sizeof(bool), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + } else { + cpu_found_inf_data = (*found_inf_data); + } + + for (size_t i = 0; i < xs.size(); ++i) { + auto* out = outs[i]; + T* out_data = out->mutable_data(dev_ctx.GetPlace()); + int num = out->numel(); + if (cpu_found_inf_data) { + VLOG(1) << "-- UpdateLossScaling: Find infinite grads. --"; + int r = 0; + r = xpu::constant(dev_ctx.x_context(), + reinterpret_cast(out_data), num, + XPUTyp(0.0)); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(constant) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } + } + const bool stop_update = ctx.Attr("stop_update"); + if (stop_update) { + return; + } + + const auto* pre_loss_scaling = ctx.Input("PrevLossScaling"); + const auto* good_in = ctx.Input("InGoodSteps"); + const auto* bad_in = ctx.Input("InBadSteps"); + auto* updated_loss_scaling = ctx.Output("LossScaling"); + auto* good_out = ctx.Output("OutGoodSteps"); + auto* bad_out = ctx.Output("OutBadSteps"); + const MPDType* pre_loss_scaling_data = pre_loss_scaling->data(); + const int* good_in_data = good_in->data(); + const int* bad_in_data = bad_in->data(); + + MPDType* updated_loss_scaling_data = + updated_loss_scaling->mutable_data(dev_ctx.GetPlace()); + int* good_out_data = good_out->mutable_data(dev_ctx.GetPlace()); + int* bad_out_data = bad_out->mutable_data(dev_ctx.GetPlace()); + + const int incr_every_n_steps = ctx.Attr("incr_every_n_steps"); + const int decr_every_n_nan_or_inf = + ctx.Attr("decr_every_n_nan_or_inf"); + const float incr_ratio = ctx.Attr("incr_ratio"); + const float decr_ratio = ctx.Attr("decr_ratio"); + + int cpu_bad_in_data; + int cpu_good_in_data; + MPDType cpu_pre_loss_scaling_data; + if (platform::is_xpu_place(bad_in->place())) { + xpu_memcpy(&cpu_bad_in_data, bad_in_data, sizeof(int), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + } else { + cpu_bad_in_data = (*bad_in_data); + } + + if (platform::is_xpu_place(good_in->place())) { + xpu_memcpy(&cpu_good_in_data, good_in_data, sizeof(int), + XPUMemcpyKind::XPU_DEVICE_TO_HOST); + } else { + cpu_good_in_data = (*good_in_data); + } + + if (platform::is_xpu_place(pre_loss_scaling->place())) { + xpu_memcpy(&cpu_pre_loss_scaling_data, pre_loss_scaling_data, + sizeof(MPDType), XPUMemcpyKind::XPU_DEVICE_TO_HOST); + } else { + cpu_pre_loss_scaling_data = (*pre_loss_scaling_data); + } + + int cpu_good_out_data = 0; + int cpu_bad_out_data = 0; + MPDType cpu_updated_loss_scaling_data; + + 
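// [Editor's note: illustrative sketch, not part of this patch.]
// Condensed restatement of the host-side dynamic loss-scaling rule applied
// just below: an inf/nan step resets the good counter and, after
// decr_every_n_nan_or_inf consecutive bad steps, shrinks the scale by
// decr_ratio (floored at 1); a clean step resets the bad counter and, after
// incr_every_n_steps consecutive good steps, grows the scale by incr_ratio,
// keeping the result only if it stays finite. Struct and function names here
// are made up for the example.
#include <algorithm>
#include <cmath>

struct LossScaleState {
  float scale = 32768.0f;
  int good = 0;
  int bad = 0;
};

inline void UpdateLossScale(LossScaleState* s, bool found_inf,
                            int incr_every_n_steps,
                            int decr_every_n_nan_or_inf, float incr_ratio,
                            float decr_ratio) {
  if (found_inf) {
    s->good = 0;
    if (++s->bad == decr_every_n_nan_or_inf) {
      s->scale = std::max(1.0f, s->scale * decr_ratio);
      s->bad = 0;
    }
  } else {
    s->bad = 0;
    if (++s->good == incr_every_n_steps) {
      const float next = s->scale * incr_ratio;
      if (std::isfinite(next)) s->scale = next;
      s->good = 0;
    }
  }
}

int main() {
  LossScaleState s;
  UpdateLossScale(&s, /*found_inf=*/true, 1000, 2, 2.0f, 0.5f);
  UpdateLossScale(&s, /*found_inf=*/true, 1000, 2, 2.0f, 0.5f);
  // After two consecutive bad steps the scale halves: 32768 -> 16384.
  return s.scale == 16384.0f ? 0 : 1;
}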
if (cpu_found_inf_data) { + cpu_good_out_data = 0; + cpu_bad_out_data = cpu_bad_in_data + 1; + if (cpu_bad_out_data == decr_every_n_nan_or_inf) { + MPDType new_loss_scaling = cpu_pre_loss_scaling_data * decr_ratio; + cpu_updated_loss_scaling_data = + (new_loss_scaling < static_cast(1)) + ? (static_cast(1)) + : (new_loss_scaling); + cpu_bad_out_data = 0; + } + } else { + cpu_bad_out_data = 0; + cpu_good_out_data = cpu_good_in_data + 1; + if (cpu_good_out_data == incr_every_n_steps) { + MPDType new_loss_scaling = cpu_pre_loss_scaling_data * incr_ratio; + cpu_updated_loss_scaling_data = (std::isfinite(new_loss_scaling)) + ? new_loss_scaling + : cpu_pre_loss_scaling_data; + cpu_good_out_data = 0; + } + } + + // copy to host + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), + bad_out_data, platform::CPUPlace(), &cpu_bad_out_data, + sizeof(int)); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), + good_out_data, platform::CPUPlace(), &cpu_good_out_data, + sizeof(int)); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dev_ctx.GetPlace()), + updated_loss_scaling_data, platform::CPUPlace(), + &cpu_updated_loss_scaling_data, sizeof(MPDType)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(update_loss_scaling, + ops::UpdateLossScalingXPUKernel, + ops::UpdateLossScalingXPUKernel); +#endif diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index add533bafcb0a7f20c76f0844fb609d7af719bb1..433cabcfee0104a1112baa4aca6c18d072d8f696 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -162,6 +162,7 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, ops::AssignKernel, int, ops::AssignKernel, int64_t, ops::AssignKernel, bool, ops::AssignKernel, plat::float16, + ops::AssignKernel, plat::bfloat16, ops::AssignKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/assign_op_npu.cc b/paddle/fluid/operators/assign_op_npu.cc index 93689d5e495f33484d2f05b04d25734a8c5ab07e..4f4b7d544a0d8b44453a62b461cf52802aac83d2 100644 --- a/paddle/fluid/operators/assign_op_npu.cc +++ b/paddle/fluid/operators/assign_op_npu.cc @@ -43,7 +43,7 @@ class AssignNPUKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); + const auto& runner = NpuOpRunner("Assign", {*out, *x}, {*out}, {}); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/atan2_op.cc b/paddle/fluid/operators/atan2_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8ee6540bfa5f0c413f759f58ab506ac181c19c49 --- /dev/null +++ b/paddle/fluid/operators/atan2_op.cc @@ -0,0 +1,138 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/atan2_op.h" + +#include +#include +#include +#include + +namespace paddle { +namespace operators { + +class Atan2Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X1"), "Input", "X1", "atan2"); + OP_INOUT_CHECK(ctx->HasInput("X2"), "Input", "X2", "atan2"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "atan2"); + + auto in_dims = ctx->GetInputDim("X1"); + + ctx->SetOutputDim("Out", in_dims); + } +}; + +class Atan2OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X1", "(Tensor), The input tensor of atan2 op."); + AddInput("X2", "(Tensor), The input tensor of atan2 op."); + AddOutput("Out", "(Tensor), The output tensor of atan2 op."); + AddComment(R"DOC( +Atan2 Operator. + +This operator is used to perform elementwise atan2 for input $X1$, $X2$. +$$out = atan2(x1, x2)$$ + +)DOC"); + } +}; + +class Atan2GradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X1"), "Input", "X1", "Atan2Grad"); + OP_INOUT_CHECK(ctx->HasInput("X2"), "Input", "X2", "Atan2Grad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@Grad", "Atan2Grad"); + + auto x1_grad_name = framework::GradVarName("X1"); + auto x2_grad_name = framework::GradVarName("X2"); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + if (ctx->HasOutput(x1_grad_name)) { + ctx->SetOutputDim(framework::GradVarName("X1"), dout_dims); + } + if (ctx->HasOutput(x2_grad_name)) { + ctx->SetOutputDim(framework::GradVarName("X2"), dout_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X1"); + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +template +class Atan2GradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("atan2_grad"); + retv->SetInput("X1", this->Input("X1")); + retv->SetInput("X2", this->Input("X2")); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput(framework::GradVarName("X1"), this->InputGrad("X1")); + retv->SetOutput(framework::GradVarName("X2"), this->InputGrad("X2")); + } +}; + +class Atan2OpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + auto type = ctx->GetInputDataType("X1"); + if (ctx->GetInputDataType("X1") == framework::proto::VarType::INT32 || + ctx->GetInputDataType("X1") == framework::proto::VarType::INT64 || + ctx->GetInputDataType("X2") == framework::proto::VarType::INT32 || + ctx->GetInputDataType("X2") == framework::proto::VarType::INT64) { + type = framework::proto::VarType::FP64; + } + ctx->SetOutputDataType("Out", type); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(atan2, ops::Atan2Op, ops::Atan2OpMaker, + ops::Atan2GradMaker, + ops::Atan2GradMaker, + ops::Atan2OpVarTypeInference); + +REGISTER_OPERATOR(atan2_grad, 
ops::Atan2GradOp); + +REGISTER_OP_CPU_KERNEL( + atan2, ops::Atan2Kernel, + ops::Atan2Kernel, + ops::Atan2Kernel, + ops::Atan2Kernel, + ops::Atan2Kernel); + +REGISTER_OP_CPU_KERNEL( + atan2_grad, ops::Atan2GradKernel, + ops::Atan2GradKernel, + ops::Atan2GradKernel); diff --git a/paddle/fluid/operators/atan2_op.cu b/paddle/fluid/operators/atan2_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..faf1fde47e4c45a00836eee1d81ed1233170ecbe --- /dev/null +++ b/paddle/fluid/operators/atan2_op.cu @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/atan2_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + atan2, ops::Atan2Kernel, + ops::Atan2Kernel, + ops::Atan2Kernel, + ops::Atan2Kernel, + ops::Atan2Kernel); + +REGISTER_OP_CUDA_KERNEL( + atan2_grad, + ops::Atan2GradKernel, + ops::Atan2GradKernel, + ops::Atan2GradKernel); diff --git a/paddle/fluid/operators/atan2_op.h b/paddle/fluid/operators/atan2_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8ed0fda843d4732c80d62077b1591b9b0c9c125b --- /dev/null +++ b/paddle/fluid/operators/atan2_op.h @@ -0,0 +1,168 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using framework::To32BitIndex; + +template +struct Atan2Out { + using type = T; +}; + +template <> +struct Atan2Out { + using type = double; +}; + +template <> +struct Atan2Out { + using type = double; +}; + +template +struct Atan2Functor { + Atan2Functor(const T* x1, const T* x2, typename Atan2Out::type* out, + int64_t numel) + : x1_(x1), x2_(x2), out_(out), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + out_[idx] = static_cast::type>( + ::atan2f(static_cast(x1_[idx]), static_cast(x2_[idx]))); + } + + const T* x1_; + const T* x2_; + typename Atan2Out::type* out_; + int64_t numel_; +}; + +template <> +struct Atan2Functor { + Atan2Functor(const double* x1, const double* x2, double* out, int64_t numel) + : x1_(x1), x2_(x2), out_(out), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + out_[idx] = ::atan2(x1_[idx], x2_[idx]); + } + + const double* x1_; + const double* x2_; + double* out_; + int64_t numel_; +}; + +// dx1 = dout * x2 / ((x1)^2 + (x2)^2) +// dx2 = - dout * x1 / ((x1)^2 + (x2)^2) +template +struct Atan2GradFunctor { + Atan2GradFunctor(const T* x1, const T* x2, const T* dout, T* dx1, T* dx2, + int64_t numel) + : x1_(x1), x2_(x2), dout_(dout), dx1_(dx1), dx2_(dx2), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + float x1 = static_cast(x1_[idx]); + float x2 = static_cast(x2_[idx]); + float x = x1 * x1 + x2 * x2; + dx1_[idx] = static_cast(static_cast(dout_[idx]) * x2 / x); + dx2_[idx] = static_cast(-static_cast(dout_[idx]) * x1 / x); + } + + const T* x1_; + const T* x2_; + const T* dout_; + T* dx1_; + T* dx2_; + int64_t numel_; +}; + +template <> +struct Atan2GradFunctor { + Atan2GradFunctor(const double* x1, const double* x2, const double* dout, + double* dx1, double* dx2, int64_t numel) + : x1_(x1), x2_(x2), dout_(dout), dx1_(dx1), dx2_(dx2), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + auto x = x1_[idx] * x1_[idx] + x2_[idx] * x2_[idx]; + dx1_[idx] = dout_[idx] * x2_[idx] / x; + dx2_[idx] = -dout_[idx] * x1_[idx] / x; + } + + const double* x1_; + const double* x2_; + const double* dout_; + double* dx1_; + double* dx2_; + int64_t numel_; +}; + +template +class Atan2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* X1 = context.Input("X1"); + const Tensor* X2 = context.Input("X2"); + Tensor* Out = context.Output("Out"); + + auto numel = X1->numel(); + auto x1 = X1->data(); + auto x2 = X2->data(); + auto out = Out->mutable_data::type>( + context.GetPlace(), size_t(numel * sizeof(typename Atan2Out::type))); + auto& dev_ctx = context.template device_context(); + + platform::ForRange for_range(dev_ctx, numel); + Atan2Functor functor(x1, x2, out, numel); + for_range(functor); + } +}; + +template +class Atan2GradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const { + const Tensor* X1 = context.Input("X1"); + const Tensor* X2 = context.Input("X2"); + const Tensor* dOut = context.Input(framework::GradVarName("Out")); + Tensor* dX1 = 
context.Output(framework::GradVarName("X1")); + Tensor* dX2 = context.Output(framework::GradVarName("X2")); + + auto numel = X1->numel(); + auto x1 = X1->data(); + auto x2 = X2->data(); + auto dout = dOut->data(); + auto dx1 = + dX1->mutable_data(context.GetPlace(), size_t(numel * sizeof(T))); + auto dx2 = + dX2->mutable_data(context.GetPlace(), size_t(numel * sizeof(T))); + auto& dev_ctx = context.template device_context(); + + platform::ForRange for_range(dev_ctx, numel); + Atan2GradFunctor functor(x1, x2, dout, dx1, dx2, numel); + for_range(functor); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index fc31885824b55f22bba77559d728a1e40d47e784..edad20435b41c9eb59c3df793c00ab3bfe96771b 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -575,7 +575,7 @@ class BatchNormGradKernel // SavedVariance have been reverted in forward operator const auto *saved_inv_variance = ctx.Input("SavedVariance"); const std::string data_layout_str = ctx.Attr("data_layout"); - const bool use_global_stats = ctx.Attr("use_global_stats"); + bool use_global_stats = ctx.Attr("use_global_stats"); const bool is_test = ctx.Attr("is_test"); const float epsilon = ctx.Attr("epsilon"); const DataLayout data_layout = @@ -585,6 +585,8 @@ class BatchNormGradKernel auto *d_scale = ctx.Output(framework::GradVarName("Scale")); auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + use_global_stats = is_test || use_global_stats; + // batch_norm with inplace as false will take X as grad input, which // is same as cuDNN batch_norm backward calculation, batch_norm // with inplace as true only take Y as input and X should be calculate @@ -605,13 +607,6 @@ class BatchNormGradKernel "X@GRAD and Y@GRAD inplaced in non-inplace mode")); } - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - // Get the size for each dimension. // NCHW [batch_size, in_channels, in_height, in_width] const auto &x_dims = x->dims(); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 41dc87ac1ba4788b89ad0a0dd01c7aba981fd746..42e1e2e7463c7753fbf205c88442db63733754ea 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -225,11 +225,17 @@ class BatchNormKernel #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + } else if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; } else { mode_ = CUDNN_BATCHNORM_SPATIAL; } #else - mode_ = CUDNN_BATCHNORM_SPATIAL; + if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } #endif // CUDNN_VERSION_MIN(7, 0, 1) VLOG(3) << "Setting descriptors."; @@ -382,8 +388,8 @@ class BatchNormKernel } // Run training mode. - // obtain running mean and running inv var, and see if we need to - // initialize them. + // obtain running mean and running inv var, and there is no need + // to initialize them. 
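// [Editor's note: illustrative sketch, not part of this patch.]
// Numerical check of the Atan2GradFunctor formulas introduced earlier in this
// patch (x1 plays the role of y, x2 of x):
//   d/dx1 atan2(x1, x2) =  x2 / (x1^2 + x2^2)
//   d/dx2 atan2(x1, x2) = -x1 / (x1^2 + x2^2)
#include <cmath>
#include <cstdio>

int main() {
  const double x1 = 0.8, x2 = -1.3, eps = 1e-6;  // away from the branch cut
  const double denom = x1 * x1 + x2 * x2;
  const double dx1_analytic = x2 / denom;
  const double dx2_analytic = -x1 / denom;
  const double dx1_fd =
      (std::atan2(x1 + eps, x2) - std::atan2(x1 - eps, x2)) / (2 * eps);
  const double dx2_fd =
      (std::atan2(x1, x2 + eps) - std::atan2(x1, x2 - eps)) / (2 * eps);
  std::printf("dx1: %.6f vs %.6f   dx2: %.6f vs %.6f\n", dx1_analytic, dx1_fd,
              dx2_analytic, dx2_fd);
  return 0;
}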
auto *mean_out = ctx.Output("MeanOut"); auto *variance_out = ctx.Output("VarianceOut"); @@ -394,10 +400,6 @@ class BatchNormKernel auto *saved_variance = ctx.Output("SavedVariance"); saved_mean->mutable_data>(ctx.GetPlace()); saved_variance->mutable_data>(ctx.GetPlace()); - math::SetConstant> - functor; - functor(dev_ctx, saved_mean, static_cast>(0)); - functor(dev_ctx, saved_variance, static_cast>(0)); if ((N * H * W * D) == 1) { // Only 1 element in normalization dimension, @@ -817,7 +819,7 @@ class BatchNormGradKernel platform::errors::InvalidArgument("It must use CUDAPlace.")); double epsilon = static_cast(ctx.Attr("epsilon")); const std::string data_layout_str = ctx.Attr("data_layout"); - const bool use_global_stats = ctx.Attr("use_global_stats"); + bool use_global_stats = ctx.Attr("use_global_stats"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); @@ -850,12 +852,7 @@ class BatchNormGradKernel } const bool is_test = ctx.Attr("is_test"); - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); + use_global_stats = is_test || use_global_stats; const auto &x_dims = x->dims(); @@ -998,11 +995,17 @@ class BatchNormGradKernel #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + } else if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; } else { mode_ = CUDNN_BATCHNORM_SPATIAL; } #else - mode_ = CUDNN_BATCHNORM_SPATIAL; + if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } #endif // CUDNN_VERSION_MIN(7, 0, 1) #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt index 54008336a9f67f0123ba1cfa6fcea35b79b7ac4c..e5023d8eb354aedd221d9b4e86963a5b8d30390b 100644 --- a/paddle/fluid/operators/benchmark/CMakeLists.txt +++ b/paddle/fluid/operators/benchmark/CMakeLists.txt @@ -1,3 +1,3 @@ cc_test(op_tester SRCS op_tester.cc op_tester_config.cc DEPS memory timer framework_proto proto_desc lod_tensor op_registry - device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} eigen_function) diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..074607e05ea7d5b85134f36818ae407ddc73c465 --- /dev/null +++ b/paddle/fluid/operators/broadcast_tensors_op.cc @@ -0,0 +1,253 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/broadcast_tensors_op.h" + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { +using framework::Tensor; +using framework::DDim; + +class BroadcastTensorsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors"); + OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out", + "broadcast_tensors"); + + int target_rank = 0; + const auto& input_dims = ctx->GetInputsDim("X"); + // 1. Find Output rank = max(Inputs rank) + for (const auto& input_ddim : input_dims) { + target_rank = std::max(target_rank, input_ddim.size()); + } + + PADDLE_ENFORCE_GT( + target_rank, 0, + platform::errors::InvalidArgument( + "BroadcastTensorsOp requires at least one input tensor" + "to have rank greater than zero")); + + std::vector target_dims(target_rank, 0); + // 2. Output dim(axis=x) = max(Inputs dim(axis=x)) + for (int index = 0; index < target_rank; index++) { + // Loop axes in reverse order, + // For each axis, take the maximum as target size + // Fill size = 1 if shape vector exhausts + int target_dim_size = 1; + for (const auto& input_ddim : input_dims) { + // Reversed order + int axis = static_cast(input_ddim.size()) - index - 1; + int dim_size = 1; + if (axis >= 0) { + dim_size = input_ddim[axis]; + } + + // We performed bcast semantics check at python level + // So input tensors should all have legal shape + target_dim_size = std::max(target_dim_size, dim_size); + } + target_dims[target_rank - index - 1] = target_dim_size; + } + + // 3. Set Output Dim + std::vector output_ddims; + for (size_t i = 0; i < input_dims.size(); i++) { + output_ddims.emplace_back(framework::make_ddim(target_dims)); + } + ctx->SetOutputsDim("Out", output_ddims); + ctx->ShareAllLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + // Broadcast semantics enforces all input variables having the same + // DataType/VarType + // This condition is also checked during VarType Inference + // Here we simply copy input type to output + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class BroadcastTensorsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "A Varaible list. The shape and data type of the list elements" + "should be consistent. Variable can be multi-dimensional Tensor" + "or LoDTensor, and data types can be: bool, float16, float32, " + "float64, int32, " + "int64.") + .AsDuplicable(); + AddOutput("Out", + "the sum of input :code:`x`. 
its shape and data types are " + "consistent with :code:`x`.") + .AsDuplicable(); + AddComment( + R"DOC(This OP is used to broadcast a vector of inputs + with Tensor or LoDTensor type, following broadcast semantics.)DOC"); + } +}; + +class BroadcastTensorsOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + // We need at least two tensors to satisfy broadcast semantics + size_t input_size = ctx->InputSize("X"); + PADDLE_ENFORCE_GT( + input_size, 0, + platform::errors::InvalidArgument( + "BroadcastTensorsOp should have at least one input variables," + "but only received %d ", + input_size)); + + // BroadcastTensorsOp takes a vector of variables named "X" + // Here we loop through input variables, + // and check if their DataType/VarType are the same + auto var_type = ctx->GetInputType("X", 0); + auto data_type = ctx->GetInputDataType("X", 0); + for (size_t ind = 1; ind < input_size; ind++) { + auto cur_var_type = ctx->GetInputType("X", ind); + PADDLE_ENFORCE_EQ( + var_type, cur_var_type, + platform::errors::InvalidArgument( + "inputs to BroadcastTensorsOp should have the same variable type," + "but detected %d v.s %d ", + framework::ToTypeName(var_type), + framework::ToTypeName(cur_var_type))); + + auto cur_data_type = ctx->GetInputDataType("X", ind); + PADDLE_ENFORCE_EQ( + data_type, cur_data_type, + platform::errors::InvalidArgument( + "inputs to BroadcastTensorsOp should have the same data type," + "but detected %d v.s %d ", + framework::ToTypeName(var_type), + framework::ToTypeName(cur_var_type))); + } + + // Outputs having the same DataType/VarType as inputs + ctx->SetOutputType("Out", var_type, framework::ALL_ELEMENTS); + ctx->SetOutputDataType("Out", data_type, framework::ALL_ELEMENTS); + } +}; + +/* ------ BroadcastTensorsGradOp ------ */ +class BroadcastTensorsGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutputs(framework::GradVarName("X")), "Output", + "X@grad", "broadcast_tensors"); + OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors"); + OP_INOUT_CHECK(ctx->HasInputs(framework::GradVarName("Out")), "Input", + "Out@grad", "broadcast_tensors"); + + const auto& forward_input_dims = ctx->GetInputsDim("X"); + ctx->SetOutputsDim(framework::GradVarName("X"), forward_input_dims); + ctx->ShareAllLoD("X", /*->*/ framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.device_context()); + } +}; + +template +class BroadcastTensorsGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("broadcast_tensors_grad"); + // We need "X" only for backward shape inference + grad_op->SetInput("X", this->Input("X")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("X"), + this->InputGrad("X", /* drop_empty_grad */ false)); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +class BroadcastTensorsGradOpVarTypeInference + : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* 
ctx) const override { + auto var_type = ctx->GetInputType("X", 0); + auto data_type = ctx->GetInputDataType("X", 0); + + ctx->SetOutputType(framework::GradVarName("X"), var_type, + framework::ALL_ELEMENTS); + ctx->SetOutputDataType(framework::GradVarName("X"), data_type, + framework::ALL_ELEMENTS); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer, + "X"); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp, + ops::BroadcastTensorsOpMaker, + ops::BroadcastTensorsGradOpMaker, + ops::BroadcastTensorsGradOpMaker, + ops::BroadcastTensorsOpVarTypeInference); + +REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp, + ops::BroadcastTensorsGradOpVarTypeInference, + ops::BroadcastTensorsGradNoNeedBufVarsInferer); + +REGISTER_OP_CPU_KERNEL( + broadcast_tensors, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel); + +REGISTER_OP_CPU_KERNEL( + broadcast_tensors_grad, + ops::BroadcastTensorsGradOpKernel, + ops::BroadcastTensorsGradOpKernel, + ops::BroadcastTensorsGradOpKernel, + ops::BroadcastTensorsGradOpKernel, + ops::BroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..d670e1b333d411daa8e107356fdba62812a38bee --- /dev/null +++ b/paddle/fluid/operators/broadcast_tensors_op.cu @@ -0,0 +1,132 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/broadcast_tensors_op.h" + +#include +#include +#include +#include +#include + +#include "paddle/fluid/operators/reduce_ops/cub_reduce.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using framework::DDim; + +template +struct IdentityFunctor { + HOSTDEVICE explicit inline IdentityFunctor() {} + + template + HOSTDEVICE inline Tout operator()(const U& x) const { + return static_cast(x); + } +}; + +template +class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // Find reduce dimensions + const auto& in_tensors = + context.MultiInput(framework::GradVarName("Out")); + auto out_tensors = context.MultiOutput(framework::GradVarName("X")); + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, 1, + platform::errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ( + num_ins, out_tensors.size(), + platform::errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and outputs," + "but received: %d inputs v.s %d outputs", + num_ins, out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + auto* input_tensor = in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const DDim& input_dims = input_tensor->dims(); + const DDim& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // Collect reduce_dims + // Example: + // dX = [1,1,1,1] + // dOut = [1,1,1,4] + // + // reduce_dims = [3] // reduce along the broadcasted axis + std::vector reduce_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + bool just_copy = (reduce_dims_vec.size() == 0); + output_tensor->mutable_data(context.GetPlace()); + if (just_copy) { + // Turns out to be a No-Op, simply copy tensors + framework::TensorCopy(*input_tensor, context.GetPlace(), + context.device_context(), output_tensor); + } else { + // reduce_sum implementation on CUDA + auto stream = context.cuda_device_context().stream(); + TensorReduce>( + *input_tensor, output_tensor, reduce_dims_vec, static_cast(0), + cub::Sum(), IdentityFunctor(), stream); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + broadcast_tensors, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel, + ops::BroadcastTensorsOpKernel); + +REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad, + ops::CUDABroadcastTensorsGradOpKernel, + ops::CUDABroadcastTensorsGradOpKernel, + ops::CUDABroadcastTensorsGradOpKernel, + ops::CUDABroadcastTensorsGradOpKernel, + ops::CUDABroadcastTensorsGradOpKernel); diff --git a/paddle/fluid/operators/broadcast_tensors_op.h b/paddle/fluid/operators/broadcast_tensors_op.h new file mode 100644 index 0000000000000000000000000000000000000000..0eeb9234df0fee76f2f4233803b1a4bd517ff583 --- /dev/null +++ b/paddle/fluid/operators/broadcast_tensors_op.h @@ -0,0 +1,282 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/operators/math/math_function.h" + +#define SWITCH_OUT_RANK_CASE(n) \ + case n: { \ + ApplyBroadcast(context, in_tensors[i], out_tensors[i]); \ + break; \ + } + +namespace paddle { +namespace operators { + +using framework::Tensor; +using framework::DDim; +using framework::EigenTensor; + +template +class BroadcastTensorsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto& in_tensors = context.MultiInput("X"); + auto out_tensors = context.MultiOutput("Out"); + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, 1, + platform::errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ( + num_ins, out_tensors.size(), + platform::errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and outputs," + "but received: %d inputs v.s %d outputs", + num_ins, out_tensors.size())); + + // Eigen has no support for dynamic ranked tensor + // Thus we perform static expansion for each possible ranks + for (size_t i = 0; i < num_ins; i++) { + int out_rank = out_tensors[i]->dims().size(); + switch (out_rank) { + SWITCH_OUT_RANK_CASE(1) + SWITCH_OUT_RANK_CASE(2) + SWITCH_OUT_RANK_CASE(3) + SWITCH_OUT_RANK_CASE(4) + SWITCH_OUT_RANK_CASE(5) + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Target tensor rank out of range" + "Maximum supported rank for broadcast is: 5")); + } + } + } + } + + template + void ApplyBroadcast(const framework::ExecutionContext& context, + const Tensor* input_tensor, Tensor* output_tensor) const { + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // 1. Collect bcast_dims, each element of which indicates how many + // times we need to replicate along the corresponding dimension + // 2. Collect new_input_dims_vec. 
Eigen::broadcast requires same rank for + // both input and output tensors, so we need to initialize input X with + // expanded dims: "new_input_dims_vec" + Eigen::DSizes bcast_dims; + std::vector new_input_dims_vec(out_rank); + for (int j = 0; j < out_rank; j++) { + int out_axis = out_rank - j - 1; + int in_axis = in_rank - j - 1; + + bcast_dims[out_axis] = output_dims[out_axis]; + new_input_dims_vec[out_axis] = 1; + if (in_axis >= 0 && input_dims[in_axis] == output_dims[out_axis]) { + bcast_dims[out_axis] = 1; + new_input_dims_vec[out_axis] = input_dims[in_axis]; + } + } + auto new_input_dims = framework::make_ddim(new_input_dims_vec); + + // Initialize input X with new_input_dims_vec, so it's rank-aligned with the + // output + auto x = EigenTensor::From(*input_tensor, new_input_dims); + + output_tensor->mutable_data(context.GetPlace()); + auto y = EigenTensor::From(*output_tensor, output_dims); + + auto& place = + *context.template device_context().eigen_device(); + EigenBroadcast, T, OutRank>::Eval(place, y, x, + bcast_dims); + } +}; + +#define SWITCH_RESHAPE_DIMS(n) \ + case n: { \ + Eigen::DSizes reshape_dims; \ + for (size_t i = 0; i < reshape_dims_vec.size(); ++i) { \ + reshape_dims[i] = reshape_dims_vec[i]; \ + } \ + dX.device(place) = \ + dOut.reshape(reshape_dims).sum(reduce_dims).reshape(dX.dimensions()); \ + break; \ + } + +#define UPPER_SWITCH_REDUCE_DIMS(m) \ + case m: { \ + Eigen::DSizes reduce_dims; \ + for (size_t i = 0; i < reduce_dims_vec.size(); ++i) { \ + reduce_dims[i] = reduce_dims_vec[i]; \ + } \ + switch (reshape_size) { +#define LOWER_SWITCH_REDUCE_DIMS \ + default: { \ + PADDLE_THROW(platform::errors::InvalidArgument( \ + "Detected reshape size: %d out of range" \ + "Minimum value should be larger than reduce size %d" \ + "While maximum supported is: 5", \ + reshape_size, reduce_size)); \ + } \ + } \ + break; \ + } + +/* ----- GradOpKernel ----- */ +template +class BroadcastTensorsGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // Find reduce dimensions + const auto& in_tensors = + context.MultiInput(framework::GradVarName("Out")); + auto out_tensors = context.MultiOutput(framework::GradVarName("X")); + + size_t num_ins = in_tensors.size(); + + PADDLE_ENFORCE_GT( + num_ins, 1, + platform::errors::InvalidArgument( + "Expected at least 2 input tensors, but only received d%.", + in_tensors.size())); + + PADDLE_ENFORCE_EQ( + num_ins, out_tensors.size(), + platform::errors::InvalidArgument( + "BroadcastTensorsOp expects equal number of inputs and outputs," + "but received: %d inputs v.s %d outputs", + num_ins, out_tensors.size())); + + // For each In-Out tensor pair, + // Prepare and apply broadcast dims array + for (size_t i = 0; i < num_ins; i++) { + const auto* input_tensor = in_tensors[i]; + auto* output_tensor = out_tensors[i]; + + const auto& input_dims = input_tensor->dims(); + const auto& output_dims = output_tensor->dims(); + + int in_rank = input_dims.size(); + int out_rank = output_dims.size(); + + // BroadcastTensorsGrad is simply a reduce_sum along broadcasted axes + // Here we perform the following Eigen operations: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + // Note the last "reshape(dX_shape)" will be performed implicitly, + // and we only need to collect reduce_dims and reshape_dims + std::vector reduce_dims_vec; + std::vector reshape_dims_vec; + for (int j = 0; j < in_rank; j++) { + int out_axis = out_rank - 
j - 1; + int in_axis = in_rank - j - 1; + + reshape_dims_vec.push_back(input_dims[j]); + if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) { + reduce_dims_vec.push_back(in_axis); + } + } + + size_t reduce_size = reduce_dims_vec.size(); + size_t reshape_size = reshape_dims_vec.size(); + bool just_copy = (reduce_dims_vec.size() == 0); + output_tensor->mutable_data(context.GetPlace()); + if (just_copy) { + // If this turns out to be a No-Op, simply perform a tensor copy + framework::TensorCopy(*input_tensor, context.GetPlace(), + context.device_context(), output_tensor); + } else { + PADDLE_ENFORCE_GE(reduce_dims_vec.size(), 1, + platform::errors::InvalidArgument( + "The number of dimensions of the input " + "'Out@GRAD' for Op(broadcast_tensors)" + " must be greater than or equal to 1, but " + "the value received is %d.", + reduce_dims_vec.size())); + PADDLE_ENFORCE_LE( + reduce_dims_vec.size(), 5, + platform::errors::InvalidArgument( + "The number of dimensions of the input 'Out@GRAD' " + "for Op(broadcast_tensors) must be less than or equal " + "to 5, but the value received is %d.", + reduce_dims_vec.size())); + + // Overall: + // dOut(Flattened) -> reshape(reshape_dims) -> reduce(reduce_dims) -> + // reshape(dX_shape) -> dX + auto dX = framework::EigenVector::Flatten(*output_tensor); + auto dOut = framework::EigenVector::Flatten(*input_tensor); + auto& place = + *context.template device_context().eigen_device(); + + // Expand ReduceSize and ReshapeSize into static values + switch (reduce_size) { + UPPER_SWITCH_REDUCE_DIMS(1) + SWITCH_RESHAPE_DIMS(1) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(2) + SWITCH_RESHAPE_DIMS(2) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(3) + SWITCH_RESHAPE_DIMS(3) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(4) + SWITCH_RESHAPE_DIMS(4) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + UPPER_SWITCH_REDUCE_DIMS(5) + SWITCH_RESHAPE_DIMS(5) + LOWER_SWITCH_REDUCE_DIMS + + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "Detected reduce size: %d out of range" + "While maximum supported is: 5", + reduce_size)); + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 40f4b969ec060d8453d176db67a6eb20933c6b3e..952e9ca329f102566d14cbf9180001e4ae5aef35 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -27,6 +27,9 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output tensor of cast op"); AddAttr("out_dtype", "output data type"); AddAttr("in_dtype", "input data type"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( Cast Operator. 
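The broadcast_tensors kernels above pair Eigen's broadcast in the forward pass with a reshape/sum over the broadcast axes in the backward pass (selected statically through the SWITCH_RESHAPE_DIMS and UPPER/LOWER_SWITCH_REDUCE_DIMS macros). A minimal standalone sketch of that pair of Eigen operations, on hypothetical 2-D shapes and assuming only Eigen's unsupported Tensor module (illustrative only, not part of the patch):

#include <unsupported/Eigen/CXX11/Tensor>

void BroadcastForwardBackwardSketch() {
  // Forward: x with shape {1, 3} is replicated 4x along axis 0 into y {4, 3}.
  Eigen::Tensor<float, 2> x(1, 3);
  x.setValues({{1.f, 2.f, 3.f}});
  Eigen::array<int, 2> bcast_dims = {4, 1};
  Eigen::Tensor<float, 2> y = x.broadcast(bcast_dims);

  // Backward: dX is dOut summed over the axis that was broadcast (axis 0 here),
  // which is exactly the reshape -> sum(reduce_dims) step in the grad kernel.
  Eigen::Tensor<float, 2> dout(4, 3);
  dout.setConstant(1.f);
  Eigen::array<int, 1> reduce_dims = {0};
  Eigen::Tensor<float, 1> dx = dout.sum(reduce_dims);  // shape {3}, entries 4.f

  (void)y;
  (void)dx;
}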
@@ -50,6 +53,7 @@ class CastOpGradMaker : public framework::SingleGradOpMaker { grad->SetOutput("Out", this->InputGrad("X")); grad->SetAttr("out_dtype", this->GetAttr("in_dtype")); grad->SetAttr("in_dtype", this->GetAttr("out_dtype")); + grad->SetAttr("use_mkldnn", this->GetAttr("use_mkldnn")); } }; @@ -77,6 +81,28 @@ class CastOp : public framework::OperatorWithKernel { if (platform::is_cuda_pinned_place(tensor_place)) { return framework::OpKernelType(tensor->type(), ctx.device_context()); } + +#ifdef PADDLE_WITH_MKLDNN + int in_dtype = ctx.Attr("in_dtype"); + int out_dtype = ctx.Attr("out_dtype"); + + auto MKLDNNSupportsCast = [&]() -> bool { + int dtype_fp32 = static_cast(framework::proto::VarType::FP32); + int dtype_bf16 = static_cast(framework::proto::VarType::BF16); + + if ((in_dtype != dtype_fp32 && in_dtype != dtype_bf16) || + (out_dtype != dtype_fp32 && out_dtype != dtype_bf16)) + return false; + + return true; + }; + + if (this->CanMKLDNNBeUsed(ctx, tensor->type()) && MKLDNNSupportsCast()) { + return framework::OpKernelType(tensor->type(), ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif return framework::OpKernelType(tensor->type(), tensor_place); } }; @@ -90,13 +116,11 @@ REGISTER_OPERATOR(cast, ops::CastOp, ops::CastOpGradMaker, ops::CastOpGradMaker, ops::CastOpProtoMaker); -REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel); +REGISTER_OP_CPU_KERNEL( + cast, ops::CastOpKernel, ops::CastOpKernel, + ops::CastOpKernel, ops::CastOpKernel, + ops::CastOpKernel, ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel>, + ops::CastOpKernel>); diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 13759633d0168a4d38796a88fe8db215cfcfe380..1ac110b3cafd6bfd9da29daaebb65df570a02cb0 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -95,6 +95,7 @@ struct CastOpFunctor { namespace ops = paddle::operators; +#ifdef PADDLE_WITH_HIP REGISTER_OP_CUDA_KERNEL( cast, ops::CastOpKernel, ops::CastOpKernel, @@ -105,6 +106,23 @@ REGISTER_OP_CUDA_KERNEL( ops::CastOpKernel, ops::CastOpKernel, + paddle::platform::complex>, ops::CastOpKernel); + paddle::platform::complex>); +#else +REGISTER_OP_CUDA_KERNEL( + cast, ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel>, + ops::CastOpKernel>); +#endif diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc index 0de0f5e4505795f69f1d80e2bbc1600250fc7391..4efaecbe9a5b809192c50fd6341577f04bd1b247 100644 --- a/paddle/fluid/operators/cast_op_npu.cc +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -78,8 +78,8 @@ class CastNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Cast", {*x}, {*out}, - {{"dst_type", static_cast(aclDtype)}}); + const auto& runner = NpuOpRunner( + "Cast", {*x}, {*out}, {{"dst_type", static_cast(aclDtype)}}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc index ca15858cf67d756fc8eb41f4e26a2e0b923abef6..c7c0f81f2131f73d0d9f89a7871550aab38cece8 100644 --- a/paddle/fluid/operators/cast_op_xpu.cc +++ 
b/paddle/fluid/operators/cast_op_xpu.cc @@ -23,21 +23,9 @@ limitations under the License. */ namespace paddle { namespace operators { -template -class XPUFPTypeTrait { - public: - using Type = T; -}; - -template <> -class XPUFPTypeTrait { - public: - using Type = float16; -}; - template class CastXPUKernel : public framework::OpKernel { - using XPUInTDType = typename XPUFPTypeTrait::Type; + using XPUInTDType = typename XPUTypeTrait::Type; public: void Compute(const framework::ExecutionContext& context) const override { @@ -49,7 +37,6 @@ class CastXPUKernel : public framework::OpKernel { context.Attr("out_dtype")); auto* in_data = in->data(); - // using XPUOutTDType = typename XPUFPTypeTrait::Type; auto numel = in->numel(); auto& dev_ctx = context.template device_context(); int r = -1; diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 153fa529f96a5980c6b95baedce6a6dcc0b26f6e..6ea8809dae13f2340a9664aab0213a7d89e5b3dc 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -69,6 +69,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { auto in_tensors = context.MultiInput("Input"); bool use_align = context.Attr("use_align"); + auto align_size = context.Attr("align_size"); if (context.Attr("check_name")) { for (size_t i = 0; i < in_var_names.size(); ++i) { @@ -95,7 +96,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { context.Attr("dtype")); size_t size_of_dtype = framework::SizeOfType(dtype); GetMemSizeAndDtype(in_tensors, in_var_names, &numel, size_of_dtype, - context.GetPlace(), use_align); + context.GetPlace(), use_align, align_size); // Alloc the continuous space auto fused_tensor = context.Output("FusedOutput"); @@ -113,13 +114,14 @@ class CoalesceTensorOpKernel : public framework::OpKernel { framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor); - offset += - use_align - ? platform::Alignment(len * size_of_dtype, context.GetPlace()) / - size_of_dtype - : len; + offset += use_align + ? platform::Alignment(len * size_of_dtype, + context.GetPlace(), align_size) / + size_of_dtype + : len; } } else if (context.Attr("set_constant")) { + // TODO(Liu yuang) ADD NPU SET_CONSTANT FUNCTION. math::SetConstant set_constant; set_constant(dev_ctx, fused_tensor, static_cast(context.Attr("constant"))); @@ -133,11 +135,11 @@ class CoalesceTensorOpKernel : public framework::OpKernel { framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor); } - offset += - use_align - ? platform::Alignment(len * size_of_dtype, context.GetPlace()) / - size_of_dtype - : len; + offset += use_align + ? platform::Alignment(len * size_of_dtype, + context.GetPlace(), align_size) / + size_of_dtype + : len; } } @@ -145,21 +147,31 @@ class CoalesceTensorOpKernel : public framework::OpKernel { offset = 0; std::stringstream ss; ss << "alloc_space_for_vars: "; + for (size_t i = 0; i < out_tensors.size(); ++i) { size_t len = static_cast(out_tensors[i]->numel()); auto dim = out_tensors[i]->dims(); + VLOG(4) << len << " " << dim << " " << offset; out_tensors[i] ->ShareDataWith(fused_tensor->Slice( static_cast(offset), static_cast(offset + len))) .Resize(dim); len = use_align - ? platform::Alignment(len * size_of_dtype, context.GetPlace()) / + ? 
platform::Alignment(len * size_of_dtype, context.GetPlace(), + align_size) / size_of_dtype : len; - offset += len; ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")" - << " address: " << out_tensors[i]->data() << ", "; + << " address: " << out_tensors[i]->data() << " len: " << len + << ", "; + offset += len; } + PADDLE_ENFORCE_EQ( + (int64_t)offset, fused_tensor->numel(), + platform::errors::InvalidArgument( + "The alloc_space_for_vars's offset: %s is unequal with " + "fused_tensor's numel: %s.", + offset, fused_tensor->numel())); VLOG(10) << ss.str(); } @@ -168,7 +180,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { const std::vector &lod_tensors, const std::vector var_names, size_t *numel, const size_t &size_of_dtype, const platform::Place &place, - const bool use_align = true) const { + const bool use_align = true, const int align_size = -1) const { PADDLE_ENFORCE_EQ( lod_tensors.size(), var_names.size(), platform::errors::InvalidArgument( @@ -188,16 +200,19 @@ class CoalesceTensorOpKernel : public framework::OpKernel { size, 0, platform::errors::InvalidArgument( "The number of tensor `%s`'s elements is 0.", var_names[i])); + auto len = + use_align + ? platform::Alignment(static_cast(size) * size_of_dtype, + place, align_size) / + size_of_dtype + : static_cast(size); + VLOG(4) << size << " " << len; ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() << ") " - << " addres:" << lod_tensors[i]->data() << ", "; - *numel += use_align - ? platform::Alignment( - static_cast(size) * size_of_dtype, place) / - size_of_dtype - : static_cast(size); + << " addres:" << lod_tensors[i]->data() << " len: " << len + << ", "; + *numel += len; } - VLOG(10) << ss.str(); } }; @@ -206,7 +221,42 @@ class CoalesceTensorOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override {} + void InferShape(framework::InferShapeContext *ctx) const override { + if (ctx->IsRuntime()) { + return; + } + auto use_align = ctx->Attrs().Get("use_align"); + auto align_size = ctx->Attrs().Get("align_size"); + + auto dtype = static_cast( + ctx->Attrs().Get("dtype")); + size_t size_of_dtype = framework::SizeOfType(dtype); + + auto alignment = [](size_t size, size_t align_size) { + size_t remaining = size % align_size; + auto aligned_size = + remaining == 0 ? size : size + (align_size - remaining); + VLOG(4) << remaining << " " << size << " " << align_size << " " + << aligned_size; + return aligned_size; + }; + VLOG(4) << "align_size: " << align_size; + if (use_align && align_size > 0) { + int64_t numel = 0; + auto dims = ctx->GetInputsDim("Input"); + for (const auto &dim : dims) { + auto size = framework::product(dim); + auto len = use_align + ? alignment(static_cast(size) * size_of_dtype, + align_size) / + size_of_dtype + : static_cast(size); + numel += len; + } + ctx->SetOutputDim("FusedOutput", framework::make_ddim({numel})); + VLOG(4) << "FusedOutput size:" << framework::make_ddim({numel}); + } + } protected: framework::OpKernelType GetKernelTypeForVar( @@ -256,6 +306,8 @@ class CoalesceTensorOpMaker : public framework::OpProtoAndCheckerMaker { "Whether to consider memory chunk and take alignment into " "account for inputs and outputs.") .SetDefault(true); + AddAttr("align_size", "The alignment size when use_align is True") + .SetDefault(-1); AddComment(R"DOC( CoalesceTensor Operator. 
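The coalesce_tensor changes above thread the new align_size attribute through both the kernel and InferShape: when use_align is true, each chunk's byte length is rounded up to a multiple of the alignment before it is accumulated into FusedOutput's numel. A small standalone sketch of that rounding rule (hypothetical sizes; it mirrors the alignment lambda added to InferShape rather than the platform::Alignment API):

#include <cstdint>
#include <iostream>

// Round a byte count up to the next multiple of align_size.
int64_t AlignUp(int64_t size_in_bytes, int64_t align_size) {
  const int64_t remaining = size_in_bytes % align_size;
  return remaining == 0 ? size_in_bytes
                        : size_in_bytes + (align_size - remaining);
}

int main() {
  // Two float32 tensors of 3 and 5 elements, coalesced with a 32-byte alignment.
  const int64_t size_of_dtype = 4;  // sizeof(float)
  const int64_t numel =
      AlignUp(3 * size_of_dtype, 32) / size_of_dtype +   // -> 8 elements
      AlignUp(5 * size_of_dtype, 32) / size_of_dtype;    // -> 8 elements
  std::cout << "FusedOutput numel: " << numel << std::endl;  // prints 16
  return 0;
}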
@@ -299,6 +351,16 @@ REGISTER_OP_CUDA_KERNEL( ops::CoalesceTensorOpKernel); #endif +#if defined(PADDLE_WITH_ASCEND_CL) +REGISTER_OP_CUDA_KERNEL( + coalesce_tensor, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); +#endif + #ifdef PADDLE_WITH_XPU REGISTER_OP_XPU_KERNEL( coalesce_tensor, @@ -309,6 +371,16 @@ REGISTER_OP_XPU_KERNEL( ops::CoalesceTensorOpKernel); #endif +#if defined(PADDLE_WITH_ASCEND_CL) +REGISTER_OP_NPU_KERNEL( + coalesce_tensor, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel, + ops::CoalesceTensorOpKernel); +#endif + REGISTER_OP_VERSION(coalesce_tensor) .AddCheckpoint( R"ROC( @@ -318,4 +390,14 @@ REGISTER_OP_VERSION(coalesce_tensor) "In order to optionally take memory alignment into account when " "coalescing tensors. The default value is true to be compatible " "with before.", - true)); + true)) + .AddCheckpoint( + R"ROC( + Upgrade coalesce_tensor: add a new attribute [align_size].)ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "align_size", + "In order to optionally take memory alignment into account when " + "coalescing tensors. The default value is -1 and use the default " + "align_size " + "of each place to be compatible with before.", + -1)); diff --git a/paddle/fluid/operators/collective/alltoall_op.cc b/paddle/fluid/operators/collective/alltoall_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c57b9f99676337c88d6a51927195eeedb8b0a2a --- /dev/null +++ b/paddle/fluid/operators/collective/alltoall_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/alltoall_op.h" + +namespace paddle { +namespace operators { + +class AllToAllOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AllToAll"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "AllToAll"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); + framework::DDim dim = ctx->GetInputDim("X"); + if (dim[0] < 0) dim[0] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class AllToAllOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor send."); + AddOutput("Out", "(Tensor) the result of alltoall."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddComment(R"DOC( +AllToAll Operator +Scatter tensors from all participators to all participators. +)DOC"); + } +}; + +template +class AllToAllOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("alltoall"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_INPLACE_OP_INFERER(AllToAllInplaceInferer, {"X", "Out"}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(alltoall, ops::AllToAllOp, ops::AllToAllOpMaker, + ops::AllToAllOpGradMaker, + ops::AllToAllOpGradMaker, + ops::AllToAllInplaceInferer) + +REGISTER_OP_CPU_KERNEL(alltoall, ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel); diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..1bcb47fc686cfe4b93420697b15d0c2585f0358e --- /dev/null +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/alltoall_op.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class AllToAllOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) +#if NCCL_VERSION_CODE >= 2703 + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + int send_numel = x->numel(); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); + auto place = ctx.GetPlace(); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + cudaStream_t stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + framework::DDim x_dims = x->dims(); + framework::DDim out_dims(x_dims); + PADDLE_ENFORCE_EQ( + x_dims[0] % nranks, 0, + platform::errors::InvalidArgument( + "The first dimension size (%d) of the input tensor must be " + "divisible by the number of ranks (%d).", + x_dims[0], nranks)); + auto send_buf = x->data(); + auto recv_buf = out->mutable_data(out_dims, place); + size_t offset = 0; + send_numel /= nranks; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < nranks; ++i) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + send_buf + offset, send_numel, dtype, i, comm->comm(), stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + recv_buf + offset, send_numel, dtype, i, comm->comm(), stream)); + offset += send_numel; + } + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif +#else + PADDLE_THROW( + platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(alltoall, ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/alltoall_op.h b/paddle/fluid/operators/collective/alltoall_op.h new file mode 100644 index 0000000000000000000000000000000000000000..61eec44093794ccaf820d257d7c2c6b363e10391 --- /dev/null +++ b/paddle/fluid/operators/collective/alltoall_op.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_GLOO) +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +namespace paddle { +namespace operators { + +template +class AllToAllOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support alltoall for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index f6281aa8ca2710bd7281088f5d477278c93fe328..b8631b44f14caac162dd332f715b825e42bf31af 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -43,12 +43,10 @@ class BarrierOpCUDAKernel : public framework::OpKernel { ncclRedOp_t nccl_red_type = ncclSum; PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); - auto comm_stream = - platform::NCCLCommContext::Instance().Get(rid, place)->stream(); #ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(comm_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(comm_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); #endif #else PADDLE_THROW(platform::errors::Unavailable( diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 0eaa377869ef6d64e90c468f6d68e8d911969db9..3a74f551e7a30ed64104f8054a4e063fa816944e 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -131,6 +131,7 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { int64_t numel = in->numel(); void* sendbuff = reinterpret_cast(const_cast(in->data())); + out->mutable_data(in->dims(), ctx.GetPlace()); void* recvbuff = reinterpret_cast(out->data()); int ring_id = ctx.Attr("ring_id"); diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc index 7817f19bacb1879517d4865165836f46e4b68e75..3df0595525941a93b0fb4a63014021ad519651cf 100644 --- a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -22,7 +22,11 @@ class Scope; } // namespace framework } // namespace paddle #if defined(PADDLE_WITH_ASCEND_CL) +#include "acl/acl.h" +#include "hccl/hccl.h" +#include "hccl/hccl_types.h" #include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" #endif namespace paddle { @@ -57,6 +61,33 @@ class CCommInitOpAscend : public framework::OperatorBase { } platform::HCCLCommContext::Instance().CreateHCCLComm( hccl_id, rank_ids, rank_id, device_id, rid); + + // Build comm + float* buff; + int32_t size = 20; + std::vector input(size, 0); + for (int32_t idx = 0; idx < size; idx++) { + input[idx] = 1.0; + } + PADDLE_ENFORCE_NPU_SUCCESS(aclrtMalloc(reinterpret_cast(&buff), + size * sizeof(float), + ACL_MEM_MALLOC_HUGE_FIRST)); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy( + reinterpret_cast(buff), size * sizeof(float), input.data(), + size * sizeof(float), 
ACL_MEMCPY_HOST_TO_DEVICE)); + VLOG(3) << "Build buff data successful."; + + aclrtStream stream = nullptr; + auto comm = paddle::platform::HCCLCommContext::Instance().Get(rid, place); + if (rank_id == 0) { + stream = comm->stream(); + } else { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + buff, size, HCCL_DATA_TYPE_FP32, 0, comm->comm(), stream)); + VLOG(3) << "Build connection successful."; #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with NPU.")); diff --git a/paddle/fluid/operators/collective/c_embedding_op.cc b/paddle/fluid/operators/collective/c_embedding_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3055e2ceb23dd239cf98188aa81a0d783b4f9e96 --- /dev/null +++ b/paddle/fluid/operators/collective/c_embedding_op.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_embedding_op.h" + +namespace paddle { +namespace operators { + +class CEmbeddingOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CEmbeddingOp"); + OP_INOUT_CHECK(ctx->HasInput("Ids"), "Input", "Ids", "CEmbeddingOp"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CEmbeddingOp"); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); + + VLOG(5) << "ids rank is " << ids_rank << std::endl; + PADDLE_ENFORCE_EQ(table_dims.size(), 2, + platform::errors::InvalidArgument( + "The dimensions of the 'c_embedding' must be 2. 
" + "But received c_embedding's dimensions = %d, " + "c_embedding's shape = [%s].", + table_dims.size(), table_dims)); + + auto output_dims = framework::vectorize(ids_dims); + output_dims.push_back(table_dims[1]); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + + if (ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("Ids", /*->*/ "Out"); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "W"); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("W", + "(Tensor) The input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int64 " + "contains the ids to be looked up in W."); + AddOutput("Out", "The lookup results, which have the same type as W."); + + AddAttr("start_index", + "(int64, default 0), The starting index is indeed, " + "and the out-of-bounds will be set to 0 ") + .SetDefault(0); + AddComment(R"DOC( +c_embedding Operator. + +This operator is used to perform lookups on the parameter W, +then concatenated into a dense tensor. + +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. + +)DOC"); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(CEmbeddingGradOpNoBufferVarsInferer, "W"); + +template +class CEmbeddingGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("c_embedding_grad"); + + op->SetInput("W", this->Input("W")); + op->SetInput("Ids", this->Input("Ids")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("W"), this->InputGrad("W")); + + op->SetAttrMap(this->Attrs()); + } +}; + +class CEmbeddingOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class CEmbeddingOpGradVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + auto out_var_name = framework::GradVarName("W"); + VLOG(3) << "c_embedding_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; + ctx->SetOutputType(out_var_name, framework::proto::VarType::LOD_TENSOR); + ctx->SetOutputDataType(out_var_name, ctx->GetInputDataType("W")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(c_embedding, ops::CEmbeddingOp, ops::CEmbeddingOpMaker, + ops::CEmbeddingGradOpMaker, + ops::CEmbeddingGradOpMaker); + +REGISTER_OPERATOR(c_embedding_grad, ops::CEmbeddingOpGrad, + ops::CEmbeddingGradOpNoBufferVarsInferer, + ops::CEmbeddingOpGradVarTypeInference); + 
+REGISTER_OP_CPU_KERNEL(c_embedding, ops::CEmbeddingOpCPUKernel, + ops::CEmbeddingOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ecf3887eef4ac6a8af7538789ec5fc56691b83bb --- /dev/null +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -0,0 +1,161 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/collective/c_embedding_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void CEmbedding(T *out, const T *table, const IndexT *ids, + const int rows, const int columns, const int64_t N, + const int64_t start_idx, const int64_t end_idx, + const int64_t limit) { + CUDA_KERNEL_LOOP(i, limit) { + size_t row = i / columns; + size_t col = i % columns; + auto id = ids[row]; + + if (id >= start_idx && id < end_idx) { + auto real_idx = id - start_idx; + PADDLE_ENFORCE(real_idx < N, + "The index is out of bounds, " + "please check whether the dimensions of index and " + "input meet the requirements. 
It should " + "be less than [%d], but received [%d]", + N, real_idx); + out[i] = table[real_idx * columns + col]; + } else { + out[i] = static_cast(0); + } + } +} + +template +__global__ void CEmbeddingGrad(T *table, const T *output, const IndexT *ids, + const int rows, const int columns, + const int64_t N, const int64_t start_idx, + const int64_t end_idx, const int64_t limit) { + CUDA_KERNEL_LOOP(i, limit) { + size_t row = i / columns; + size_t col = i % columns; + auto id = ids[row]; + if (id >= start_idx && id < end_idx) { + auto real_idx = id - start_idx; + paddle::platform::CudaAtomicAdd(&table[real_idx * columns + col], + output[i]); + } + } +} + +template +class CEmbeddingCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *table_t = context.Input("W"); + auto *ids_t = context.Input("Ids"); + auto *output_t = context.Output("Out"); + + const auto &dev_ctx = + context.template device_context(); + const int64_t start_idx = context.Attr("start_index"); + size_t N = table_t->dims()[0]; + size_t D = table_t->dims()[1]; + size_t K = ids_t->numel(); + + const int64_t end_idx = start_idx + N; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + auto limit = K * D; + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + + const auto &index_type = ids_t->type(); + if (index_type == framework::proto::VarType::INT32) { + CEmbedding<<>>( + output, table, ids_t->data(), K, D, N, start_idx, end_idx, + limit); + + } else if (index_type == framework::proto::VarType::INT64) { + CEmbedding<<>>( + output, table, ids_t->data(), K, D, N, start_idx, end_idx, + limit); + } + } +}; + +template +class CEmbeddingGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const auto &dev_ctx = + context.template device_context(); + const int64_t start_idx = context.Attr("start_index"); + auto ids_t = context.Input("Ids"); + auto d_output_t = context.Input(framework::GradVarName("Out")); + auto d_table_t = context.Output(framework::GradVarName("W")); + + int N = d_table_t->dims()[0]; + int D = d_table_t->dims()[1]; + int K = ids_t->numel(); + + const int64_t end_idx = start_idx + N; + auto limit = K * D; + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + + const T *d_output = d_output_t->data(); + T *d_table = d_table_t->mutable_data(context.GetPlace()); + + auto t = framework::EigenVector::Flatten(*d_table_t); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); + + const auto &index_type = ids_t->type(); + if (index_type == framework::proto::VarType::INT32) { + CEmbeddingGrad<<>>( + d_table, d_output, ids_t->data(), K, D, N, start_idx, + end_idx, limit); + } else if (index_type == framework::proto::VarType::INT64) { + CEmbeddingGrad<<>>( + d_table, d_output, ids_t->data(), K, D, N, start_idx, + end_idx, limit); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(c_embedding, ops::CEmbeddingCUDAKernel, + ops::CEmbeddingCUDAKernel, + ops::CEmbeddingCUDAKernel); +REGISTER_OP_CUDA_KERNEL(c_embedding_grad, ops::CEmbeddingGradCUDAKernel, + ops::CEmbeddingGradCUDAKernel, + ops::CEmbeddingGradCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_embedding_op.h b/paddle/fluid/operators/collective/c_embedding_op.h new file mode 100644 index 
0000000000000000000000000000000000000000..3cab6d7184441df4c87382904e7a1d35caddfbca --- /dev/null +++ b/paddle/fluid/operators/collective/c_embedding_op.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +template +class CEmbeddingOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_embedding for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc index 593eaf923a978402cc7607bb7d2bc4a6419dd2cb..af1e576a8c74f509822a1f227976c6a2ad803d82 100644 --- a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -23,15 +23,35 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" -#endif +#include "paddle/fluid/platform/dynload/hccl.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" namespace paddle { namespace operators { #ifdef PADDLE_WITH_ASCEND_CL +static void GenHCCLID(std::vector* hccl_ids) { + for (size_t i = 0; i < hccl_ids->size(); ++i) { + PADDLE_ENFORCE_NPU_SUCCESS( + platform::dynload::HcclGetRootInfo(&(*hccl_ids)[i])); + } +} + +static void CopyHCCLIDToVar(const std::vector& hccl_ids, + std::function func, + const framework::Scope& scope) { + for (size_t i = 0; i < hccl_ids.size(); ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + memcpy(hccl_id, &hccl_ids[i], sizeof(HcclRootInfo)); + } +} + class CGenHCCLIdOp : public framework::OperatorBase { public: CGenHCCLIdOp(const std::string& type, @@ -49,14 +69,22 @@ class CGenHCCLIdOp : public framework::OperatorBase { return Output("Out"); }; + std::string endpoint = Attr("endpoint"); + int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); + + std::vector hccl_ids; + hccl_ids.resize(1); + if (rank == 0) { + GenHCCLID(&hccl_ids); std::vector endpoint_list = Attr>("other_endpoints"); - SendBroadCastHCCLID(endpoint_list, 1, func, local_scope); + platform::SendBroadCastCommID(endpoint_list, &hccl_ids); } else { - std::string endpoint = Attr("endpoint"); - RecvBroadCastHCCLID(endpoint, 1, func, local_scope); + platform::RecvBroadCastCommID(server_fd, endpoint, &hccl_ids); } + + 
CopyHCCLIDToVar(hccl_ids, func, scope); scope.DeleteScope(&local_scope); } }; diff --git a/paddle/fluid/operators/collective/c_identity_op.cu.cc b/paddle/fluid/operators/collective/c_identity_op.cu.cc index 8ccf40e317aded44154f3b5046db5cec44260dce..05bb3830b601fbb6cb9be38de258b56776fafad4 100644 --- a/paddle/fluid/operators/collective/c_identity_op.cu.cc +++ b/paddle/fluid/operators/collective/c_identity_op.cu.cc @@ -14,35 +14,11 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_identity_op.h" -namespace paddle { -namespace operators { - -template -class CIdentityOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); - - int rid = ctx.Attr("ring_id"); - PADDLE_ENFORCE_GE( - rid, 0, - platform::errors::InvalidArgument( - "The ring_id (%d) for c_identity op must be non-negative.", rid)); - out->mutable_data(ctx.GetPlace()); - - TensorCopy(*x, out->place(), out); - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(c_identity, ops::CIdentityOpCUDAKernel, - ops::CIdentityOpCUDAKernel, - ops::CIdentityOpCUDAKernel, - ops::CIdentityOpCUDAKernel, - ops::CIdentityOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(c_identity, ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.h b/paddle/fluid/operators/collective/c_identity_op.h index ca817fb6bac0e1a3c2a11b93f927ea979bfd7256..c8577a9617489887167dbc7d9ae008608f1be48e 100644 --- a/paddle/fluid/operators/collective/c_identity_op.h +++ b/paddle/fluid/operators/collective/c_identity_op.h @@ -34,5 +34,23 @@ class CIdentityOpCPUKernel : public framework::OpKernel { } }; +template +class CIdentityOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int rid = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + rid, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity op must be non-negative.", rid)); + out->mutable_data(ctx.GetPlace()); + + TensorCopy(*x, out->place(), out); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/l1_norm_op.cu b/paddle/fluid/operators/collective/c_identity_op_npu.cc similarity index 55% rename from paddle/fluid/operators/l1_norm_op.cu rename to paddle/fluid/operators/collective/c_identity_op_npu.cc index a5c29bbf5debdd11f6e5b28b3a8b48c2c484517a..a822bd11a4a8332111d6c0813a377fa214a0c390 100644 --- a/paddle/fluid/operators/l1_norm_op.cu +++ b/paddle/fluid/operators/collective/c_identity_op_npu.cc @@ -1,21 +1,21 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/l1_norm_op.h" + +#include "paddle/fluid/operators/collective/c_identity_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - l1_norm, ops::L1NormKernel); -REGISTER_OP_CUDA_KERNEL( - l1_norm_grad, - ops::L1NormGradKernel); +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_identity, ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel, + ops::CIdentityOpKernel); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f75e1b3c7aedccbd0405ae26a952aa0b19b40a6d --- /dev/null +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h" + +namespace paddle { +namespace operators { + +class CSoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Logits"), "Input", "Logits", + "CSoftmaxWithCrossEntropyOp"); + OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", + "CSoftmaxWithCrossEntropyOp"); + + OP_INOUT_CHECK(ctx->HasOutput("Softmax"), "Output", "Softmax", + "CSoftmaxWithCrossEntropyOp"); + OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", + "CSoftmaxWithCrossEntropyOp"); + + auto logits_dims = ctx->GetInputDim("Logits"); + auto labels_dims = ctx->GetInputDim("Label"); + + auto logits_rank = logits_dims.size(); + auto axis = logits_rank - 1; + for (int i = 0; i < logits_rank; i++) { + if (i != axis) { + if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) { + PADDLE_ENFORCE_EQ(logits_dims[i], labels_dims[i], + platform::errors::InvalidArgument( + "Input(Logits) and Input(Label) should in " + "same shape in dimensions except axis.")); + } + } + } + + PADDLE_ENFORCE_EQ( + labels_dims[logits_rank - 1], 1UL, + platform::errors::InvalidArgument( + "the last dimension of Input(Label) should be 1." 
+ "But received: the last dimension of Input(Label) is [%d]," + "the last dimension is [%d]", + labels_dims[logits_rank - 1], logits_rank - 1)); + + ctx->SetOutputDim("Softmax", logits_dims); + + logits_dims[axis] = 1; + ctx->SetOutputDim("Loss", logits_dims); + + ctx->ShareLoD("Logits", /*->*/ "Softmax"); + ctx->ShareLoD("Logits", /*->*/ "Loss"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Logits"), + ctx.device_context()); + } +}; + +class CSoftmaxWithCrossEntropyOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("Logits", + "(Tensor, default: Tensor), The input tensor of unscaled " + "log probabilities, whose dimension :attr:`axis` should be scaled " + "by softmax."); + AddInput( + "Label", + "(Tensor) The input tensor of groud truth label. If :attr:`soft_label` " + "is set to false, Label is a Tensor in same shape with " + "Input(Logits) except the shape in dimension :attr:`axis` as 1. If " + "soft_label is set to true, Label is a Tensor in same " + "shape with Input(Logits)."); + AddOutput( + "Softmax", + "(Tensor, default: Tensor), A tensor in same shape with " + "Input(Logits). " + "The outputs value of softmax activation by given the input batch, " + "which will be used in backward calculation."); + AddOutput("Loss", + "(Tensor, default: Tensor), A tensor in same shape with " + "Input(Logits) " + "except the shape in dimension :attr:`axis` as 1. The cross " + "entropy loss."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr("rank", + "(int default 0) rank id for CSoftmaxWithCrossEntropy.") + .SetDefault(0); + AddAttr("nranks", + "(int default 1) nranks id for CSoftmaxWithCrossEntropy.") + .SetDefault(0); + AddComment(R"DOC( +CSoftmaxWithCrossEntropy Operator + +)DOC"); + } +}; + +class CSoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), true, + platform::errors::InvalidArgument( + "Input(Loss@Grad) should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true, + platform::errors::InvalidArgument( + "Input(Softmax) should be not null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Label"), true, + platform::errors::InvalidArgument("Input(Label) should be not null.")); + + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), true, + platform::errors::InvalidArgument( + "Output(Logits@Grad) should be not null.")); + + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Softmax")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Loss")), + ctx.device_context()); + } +}; + +template +class CSoftmaxWithCrossEntropyOpGradMaker + : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("c_softmax_with_cross_entropy_grad"); + + op->SetInput("Softmax", this->Output("Softmax")); + op->SetInput("Label", this->Input("Label")); + op->SetInput(framework::GradVarName("Loss"), 
this->OutputGrad("Loss")); + op->SetAttrMap(this->Attrs()); + op->SetOutput(framework::GradVarName("Logits"), this->InputGrad("Logits")); + } +}; + +DECLARE_INPLACE_OP_INFERER(CSoftmaxWithCrossEntropyInplaceInferer, + {"Logits", "Softmax"}); + +DECLARE_INPLACE_OP_INFERER(CSoftmaxWithCrossEntropyGradInplaceInferer, + {"Softmax", framework::GradVarName("Logits")}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR( + c_softmax_with_cross_entropy, ops::CSoftmaxWithCrossEntropyOp, + ops::CSoftmaxWithCrossEntropyOpMaker, + ops::CSoftmaxWithCrossEntropyOpGradMaker, + ops::CSoftmaxWithCrossEntropyOpGradMaker, + ops::CSoftmaxWithCrossEntropyInplaceInferer); + +REGISTER_OPERATOR(c_softmax_with_cross_entropy_grad, + ops::CSoftmaxWithCrossEntropyOpGrad, + ops::CSoftmaxWithCrossEntropyGradInplaceInferer); + +REGISTER_OP_CPU_KERNEL(c_softmax_with_cross_entropy, + ops::CSoftmaxWithCrossEntropyOpCPUKernel, + ops::CSoftmaxWithCrossEntropyOpCPUKernel, + ops::CSoftmaxWithCrossEntropyOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..77db86e7111112ac78bea270413ee9a2c2cba72b --- /dev/null +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -0,0 +1,262 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h" +#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void MaskLabelByIndex(T* predicted_logits, const T* logit, + const IndexT* label, const int start_index, + const int end_index, const int64_t N, + const int64_t D, const int nranks) { + CUDA_KERNEL_LOOP(i, N) { + auto real_label = label[i]; + PADDLE_ENFORCE((real_label < D * nranks) && (real_label >= 0), + "The index is out of bounds, " + "please check whether the value of label and " + "input meet the class number. 
It should " + "be less than [%d], but received [%d]", + D * nranks, real_label); + + if (real_label >= start_index && real_label < end_index) { + predicted_logits[i] = logit[i * D + real_label - start_index]; + } + } +} + +template +__global__ void MaskLabelByIndexGrad(T* logits_grad, const T* loss_grad, + const IndexT* labels, + const int start_index, const int end_index, + const int64_t N, const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + auto col = i % D; + if ((col + start_index) == labels[row]) { + logits_grad[i] = (logits_grad[i] - static_cast(1.0)) * loss_grad[row]; + } else { + logits_grad[i] *= loss_grad[row]; + } + } +} + +template +class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* logits = ctx.Input("Logits"); + const Tensor* labels = ctx.Input("Label"); + Tensor* softmax = ctx.Output("Softmax"); + Tensor* loss = ctx.Output("Loss"); + + const int rid = ctx.Attr("ring_id"); + const int nranks = ctx.Attr("nranks"); + const int rank = ctx.Attr("rank"); + + const auto& place = ctx.GetPlace(); + const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); + auto& dev_ctx = ctx.template device_context(); + + // use global calculate stream + const auto stream = static_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + + // allocate memory on device. + softmax->mutable_data(place); + loss->mutable_data(place); + + const auto& logits_dims = logits->dims(); + const auto& labels_dims = labels->dims(); + + const int axis = logits_dims.size() - 1; + const int N = SizeToAxis(axis, logits_dims); + const int D = SizeFromAxis(axis, logits_dims); + + Tensor logits_2d, softmax_2d, loss_2d; + logits_2d.ShareDataWith(*logits).Resize({N, D}); + softmax_2d.ShareDataWith(*softmax).Resize({N, D}); + loss_2d.ShareDataWith(*loss).Resize({N, 1}); + + auto eigen_logits = math::EigenMatrix::From(logits_2d); + auto eigen_softmax = math::EigenMatrix::From(softmax_2d); + + // step 1, obtain logit_max + Tensor logits_max; + logits_max = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + void* logits_max_buff = logits_max.mutable_data(place); + + auto eigen_logits_max = math::EigenMatrix::From(logits_max); + Eigen::DSizes along_axis(1); + eigen_logits_max.device(*dev_ctx.eigen_device()) = + eigen_logits.maximum(along_axis); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + logits_max_buff, logits_max_buff, logits_max.numel(), + platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(), + stream)); + + // step 2, obtain logit - logit_max + Eigen::DSizes batch_by_one(N, 1); + Eigen::DSizes one_by_class(1, D); + + eigen_softmax.device(*dev_ctx.eigen_device()) = + (eigen_logits - + eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class)) + .unaryExpr(math::ValueClip()); + + // step 3, obtain predict target + Tensor predicted_logits; + predicted_logits = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + predicted_logits.mutable_data(place); + + auto t = framework::EigenVector::Flatten(predicted_logits); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); + + const int start_index = rank * D; + const int end_index = start_index + D; + + int blocks = NumBlocks(N); + int threads = kNumCUDAThreads; + const auto& label_type = labels->type(); + + if (label_type == framework::proto::VarType::INT32) { + MaskLabelByIndex<<>>( + predicted_logits.data(), softmax_2d.data(), + labels->data(), start_index, end_index, N, 
D, nranks); + } else if (label_type == framework::proto::VarType::INT64) { + MaskLabelByIndex<<>>( + predicted_logits.data(), softmax_2d.data(), + labels->data(), start_index, end_index, N, D, nranks); + } + + void* predict_logits_buff = predicted_logits.mutable_data(place); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + predict_logits_buff, predict_logits_buff, predicted_logits.numel(), + platform::ToNCCLDataType(predicted_logits.type()), ncclSum, + comm->comm(), stream)); + + // step 4, obtain exp(logit) + eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp(); + + // step 5, obtain sum_exp_logits + Tensor sum_exp_logits; + sum_exp_logits = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + void* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); + + auto eigen_sum_exp_logits = math::EigenMatrix::From(sum_exp_logits); + eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) = + eigen_softmax.sum(along_axis); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), + platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, comm->comm(), + stream)); + + auto eigen_loss = math::EigenMatrix::From(loss_2d); + auto eigen_predicted_logits = math::EigenMatrix::From(predicted_logits); + + eigen_loss.device(*dev_ctx.eigen_device()) = + (eigen_sum_exp_logits.log().unaryExpr(math::TolerableValue()) - + eigen_predicted_logits) + .unaryExpr(math::TolerableValue()); + + eigen_softmax.device(*dev_ctx.eigen_device()) = + (eigen_softmax * + eigen_sum_exp_logits.inverse().broadcast(one_by_class)); + } +}; + +template +class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* labels = context.Input("Label"); + const Tensor* loss_grad = + context.Input(framework::GradVarName("Loss")); + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + const Tensor* softmax = context.Input("Softmax"); + const int rank = context.Attr("rank"); + auto& dev_ctx = + context.template device_context(); + + if (logit_grad != softmax) { + framework::TensorCopy(*softmax, context.GetPlace(), + context.device_context(), logit_grad); + } + const auto sofrmax_dims = softmax->dims(); + const int axis = sofrmax_dims.size() - 1; + const int N = SizeToAxis(axis, sofrmax_dims); + const int D = SizeFromAxis(axis, sofrmax_dims); + + Tensor logit_grad_2d; + logit_grad_2d.ShareDataWith(*logit_grad).Resize({N, D}); + + int blocks = NumBlocks(N * D); + int threads = kNumCUDAThreads; + const auto& label_type = labels->type(); + const int start_index = rank * D; + const int end_index = start_index + D; + + if (label_type == framework::proto::VarType::INT32) { + MaskLabelByIndexGrad<<>>( + logit_grad_2d.data(), loss_grad->data(), + labels->data(), start_index, end_index, N, D); + } else if (label_type == framework::proto::VarType::INT64) { + MaskLabelByIndexGrad<<>>( + logit_grad_2d.data(), loss_grad->data(), + labels->data(), start_index, end_index, N, D); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + c_softmax_with_cross_entropy, + ops::CSoftmaxWithCrossEntropyOpCUDAKernel, + ops::CSoftmaxWithCrossEntropyOpCUDAKernel, + ops::CSoftmaxWithCrossEntropyOpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL( + c_softmax_with_cross_entropy_grad, + ops::CSoftmaxWithCrossEntropyGradCUDAKernel, + 
ops::CSoftmaxWithCrossEntropyGradCUDAKernel, + ops::CSoftmaxWithCrossEntropyGradCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..c7cfd41fa2556873166701c96616323d2b1e40c3 --- /dev/null +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/softmax_op.h" + +namespace paddle { +namespace operators { + +template +class CSoftmaxWithCrossEntropyOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_embedding for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc index 03046d571d0f0542ff714868205d5a0aa285e685..37ec989f3f981227e37deb277c32301926723ed5 100644 --- a/paddle/fluid/operators/collective/c_split_op.cc +++ b/paddle/fluid/operators/collective/c_split_op.cc @@ -45,6 +45,12 @@ class CSplitOp : public framework::OperatorWithKernel { rank, nranks)); framework::DDim dim = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ( + dim[dim.size() - 1] % nranks, 0, + platform::errors::InvalidArgument("The last dimension (%d) of the X " + "should be divisible by nranks (%d)", + dim[dim.size() - 1], nranks)); + dim[dim.size() - 1] = dim[dim.size() - 1] / nranks; if (dim[0] < 0) dim[0] = -1; ctx->SetOutputDim("Out", dim); diff --git a/paddle/fluid/operators/collective/c_split_op.cu.cc b/paddle/fluid/operators/collective/c_split_op.cu similarity index 65% rename from paddle/fluid/operators/collective/c_split_op.cu.cc rename to paddle/fluid/operators/collective/c_split_op.cu index 92a7f5e41b1d2d8a1e3f4582ad014f630010c8ca..034accbb480c78be767e5b2900ccc376cfa5f635 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu.cc +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -16,10 +16,38 @@ limitations under the License. 
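// The CUDA kernels registered above implement a model-parallel softmax with
// cross entropy: each rank holds a D-wide shard of the class dimension of
// Logits, the row-wise maximum and the sum of exponentials are combined across
// ranks with ncclAllReduce, and only the rank whose shard owns the label class
// contributes the "predicted logit" (the other ranks contribute zero before the
// sum-allreduce). A host-side sketch of the same arithmetic for one row, with
// the nranks shards simulated in a single process (illustrative only; value
// clipping and the tolerable-value guards of the real kernel are omitted):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  // One row of logits; the class dimension is split into nranks shards of width D.
  const int nranks = 2, D = 3;
  std::vector<std::vector<float>> shard = {{1.f, 2.f, 0.5f}, {3.f, -1.f, 0.f}};
  const int label = 3;  // global class index, owned by rank = label / D

  // step 1: allreduce(max) over the shards.
  float logit_max = shard[0][0];
  for (const auto& s : shard)
    for (float v : s) logit_max = std::max(logit_max, v);

  // steps 2-5: subtract the max, sum exp over all shards, and pick the
  // predicted (shifted) logit from the owning shard.
  float sum_exp = 0.f, predicted = 0.f;
  for (int r = 0; r < nranks; ++r) {
    for (int c = 0; c < D; ++c) {
      const float shifted = shard[r][c] - logit_max;
      sum_exp += std::exp(shifted);
      if (r * D + c == label) predicted = shifted;  // MaskLabelByIndex
    }
  }
  // loss = log(sum_exp_logits) - predicted_logits; softmax = exp(shifted) / sum_exp
  std::printf("loss = %f\n", std::log(sum_exp) - predicted);
  return 0;
}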
*/ #include "paddle/fluid/operators/collective/c_split_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { namespace operators { +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void SplitFromRank(const T* input, T* output, const int rows, + const int columns, const int rank, + const int nranks, const int limit) { + CUDA_KERNEL_LOOP(i, limit) { + int row = i / columns; + int col = i % columns; + + int block = columns / nranks; + int start = block * rank; + int end = start + block; + + if (col >= start && col < end) { + int idx = block * row + col % block; + output[idx] = input[i]; + } + } +} + template class CSplitOpCUDAKernel : public framework::OpKernel { public: @@ -47,24 +75,25 @@ class CSplitOpCUDAKernel : public framework::OpKernel { rank, nranks)); auto& dev_ctx = ctx.template device_context(); - std::vector shape_refer; - std::vector results; - size_t numel = x->numel(); auto dims = x->dims(); - numel /= nranks; - int axis = dims.size() - 1; - dims[dims.size() - 1] /= nranks; - for (int i = 0; i < nranks; i++) { - framework::Tensor* out = new framework::Tensor(); - out->mutable_data(dims, place); - shape_refer.emplace_back(out); - results.emplace_back(out); - } + auto dims_size = dims.size(); + // final dim + int64_t end_size = dims[dims_size - 1]; - math::SplitFunctor functor; - functor(dev_ctx, *x, shape_refer, axis, &results); + // remain dim + auto remain_ddim = framework::slice_ddim(dims, 0, dims_size - 1); + int64_t remain_numel = framework::product(remain_ddim); + + int limit = x->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + + dims[dims_size - 1] /= nranks; out->mutable_data(dims, place); - paddle::framework::TensorCopySync(*results[rank], out->place(), out); + + SplitFromRank<<>>( + x->data(), out->data(), remain_numel, end_size, rank, nranks, + limit); } }; } // namespace operators diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 83da712bee90881120ee09fc6fad56f7a6a2615a..71ab25a7b0ff8a490d7de0022f810009a58482d4 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -46,7 +46,7 @@ Call calculation stream synchronization. 
}; template -class CSyncCalcStreamCudaKernel : public framework::OpKernel { +class CSyncCalcStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) @@ -86,5 +86,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream, ops::CSyncCalcStreamOp, ops::CSyncCalcStreamOpMaker); -REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, - ops::CSyncCalcStreamCudaKernel); +REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); + +REGISTER_OP_NPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc index 4b1f7bb340178748d302f9ec5a5c987a25dae2e3..45613715b8260c3f38968e5cd91f245cd9f524d5 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -35,7 +35,7 @@ namespace m = paddle::operators::math; USE_OP(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, NPU); -USE_NO_KERNEL_OP(c_sync_calc_stream); +USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU); template void Compare(f::Scope* scope, const p::DeviceContext& ctx) { diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index e6f6bf53456198c61d8a723d9675f482fd593e42..71fda2cd01c8d6007cab19ebeea365467e8e7a99 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -58,12 +58,11 @@ Call communication stream synchronization. 
}; template -class CSyncCommStreamCudaKernel : public framework::OpKernel { +class CSyncCommStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto place = ctx.GetPlace(); #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int ring_id = ctx.Attr("ring_id"); auto stream = platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); @@ -75,7 +74,6 @@ class CSyncCommStreamCudaKernel : public framework::OpKernel { #endif #elif defined(PADDLE_WITH_ASCEND_CL) - auto place = ctx.GetPlace(); PADDLE_ENFORCE_EQ(is_npu_place(place), true, platform::errors::PreconditionNotMet( "Sync stream op can run on npu place only for now.")); @@ -99,5 +97,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream, ops::CSyncCommStreamOp, ops::CSyncCommStreamOpMaker); -REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, - ops::CSyncCommStreamCudaKernel); +REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); + +REGISTER_OP_NPU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index 3915ec4fa35e8bfbf77095e5afff102d2d924d4d..6c5a6db61483dcd7e3578ded6a12a8a421ca1933 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -43,7 +43,7 @@ namespace p = paddle::platform; namespace m = paddle::operators::math; USE_OP(c_broadcast); -USE_NO_KERNEL_OP(c_sync_comm_stream); +USE_OP_DEVICE_KERNEL(c_sync_comm_stream, NPU); USE_NO_KERNEL_OP(c_gen_hccl_id); USE_NO_KERNEL_OP(c_comm_init_hccl); USE_OP_DEVICE_KERNEL(c_broadcast, NPU); diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cc b/paddle/fluid/operators/collective/partial_allgather_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..bbe537823474162c53e5e0301c4e3ddaa6594ac8 --- /dev/null +++ b/paddle/fluid/operators/collective/partial_allgather_op.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/partial_allgather_op.h" + +namespace paddle { +namespace operators { + +class PartialAllGatherOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "PartialAllGather"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Input", "Out", "PartialAllGather"); + int nranks = ctx->Attrs().Get("nranks"); + int rank = ctx->Attrs().Get("rank"); + + PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( + "The value of nranks should be >=2.")); + PADDLE_ENFORCE_EQ( + (rank >= 0 && rank < nranks), true, + platform::errors::InvalidArgument( + "The rank (%d) for partial_allgather op must >=0 and GetInputDim("X"); + ctx->SetOutputDim("Out", dim); + } +}; + +class PartialAllGatherOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be partial allgather"); + AddOutput("Out", "(Tensor) the allgather result"); + AddAttr("ring_id", "(int default 0) communication ring id.") + .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all gather.") + .SetDefault("tag"); +#endif + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddAttr("nranks", + "Total trainer count of the distributed training job"); + AddAttr("rank", "Rand of the distributed training job"); + AddComment(R"DOC( +PartialAllGather Operator. +Divide the Input into nranks copies and only use the rank part. +Each rank receives the aggregation of data from all ranks in the order of the ranks. + + +reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#allgather +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(partial_allgather, ops::PartialAllGatherOp, + ops::PartialAllGatherOpMaker); + +REGISTER_OP_CPU_KERNEL(partial_allgather, + ops::PartialAllGatherOpCPUKernel, + ops::PartialAllGatherOpCPUKernel, + ops::PartialAllGatherOpCPUKernel, + ops::PartialAllGatherOpCPUKernel, + ops::PartialAllGatherOpCPUKernel); diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..8c32f8c41bbf25f687c66bb21fd3833f10258210 --- /dev/null +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/partial_allgather_op.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class PartialAllGatherOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + int64_t numel = in->numel(); + ncclDataType_t dtype = platform::ToNCCLDataType(in->type()); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + int rid = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + + PADDLE_ENFORCE_EQ( + nranks, comm->nranks(), + platform::errors::InvalidArgument("nranks: %s should equal to %s", + nranks, comm->nranks())); + PADDLE_ENFORCE_EQ(rank, comm->rank(), + platform::errors::InvalidArgument( + "rank: %s should equal to %s", rank, comm->rank())); + PADDLE_ENFORCE_EQ( + (numel % nranks), 0, + platform::errors::InvalidArgument( + "The input numel (%d) must be divisible by nranks(%d)", numel, + nranks)); + + framework::DDim dims = in->dims(); + out->mutable_data(dims, place); + + int64_t send_numel = numel / nranks; + int offset = send_numel * rank; + const T* send_buff = in->data() + offset; + T* recv_buff = out->data(); + + gpuStream_t stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + send_buff, recv_buff, send_numel, static_cast(dtype), + comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(partial_allgather, + ops::PartialAllGatherOpCUDAKernel, + ops::PartialAllGatherOpCUDAKernel, + ops::PartialAllGatherOpCUDAKernel, + ops::PartialAllGatherOpCUDAKernel, + ops::PartialAllGatherOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/partial_allgather_op.h b/paddle/fluid/operators/collective/partial_allgather_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a6f0d75471a62547a3bad08a2dfd2a913bc1b1e9 --- /dev/null +++ b/paddle/fluid/operators/collective/partial_allgather_op.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class PartialAllGatherOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support partial_allgather for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/partial_recv_op.cc b/paddle/fluid/operators/collective/partial_recv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..22c723ff7f4e1bacea457f6bea10db55ed50794f --- /dev/null +++ b/paddle/fluid/operators/collective/partial_recv_op.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/partial_recv_op.h" +#include + +namespace paddle { +namespace operators { + +class PartialRecvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "PartialRecv"); + int peer = ctx->Attrs().Get("peer"); + int ring_id = ctx->Attrs().Get("ring_id"); + int num = ctx->Attrs().Get("num"); + int id = ctx->Attrs().Get("id"); + auto out_shape = ctx->Attrs().Get>("out_shape"); + + PADDLE_ENFORCE_GE( + peer, 0, + platform::errors::InvalidArgument( + "The peer (%d) for partial_recv op must be non-negative.", peer)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for partial_recv op must be non-negative.", + ring_id)); + PADDLE_ENFORCE_GE(num, 1, + platform::errors::InvalidArgument( + "The num (%d) for partial_send op must >=1", num)); + PADDLE_ENFORCE_EQ( + (id >= 0 && id < num), true, + platform::errors::InvalidArgument( + "The id (%d) for partial_send op must >=0 and SetOutputDim("Out", framework::make_ddim(out_shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + int dtype = ctx.Attr("dtype"); + framework::proto::VarType::Type type = + framework::proto::VarType::Type(dtype); + return framework::OpKernelType(type, ctx.GetPlace()); + } +}; + +class PartialRecvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddOutput("Out", "(Tensor) tensor to receive."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr("peer", "(int default 0) rank id for sender.").SetDefault(0); + AddAttr("dtype", "(int default 5('float32')) data type of tensor.") + .SetDefault(5); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + 
.SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif + AddAttr>("out_shape", "shape of the output tensor.") + .SetDefault(std::vector()); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddAttr("num", "(int default 1) The number of Output to be cut.") + .SetDefault(1); + AddAttr("id", + "(int default 0) ID of the part to be recv after Output cut.") + .SetDefault(0); + AddComment(R"DOC( +Recv Operator. +Divide the Output into num copies and only recv the id part. + +Reference: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html#sendrecv +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(partial_recv, ops::PartialRecvOp, + ops::PartialRecvOpMaker); + +REGISTER_OP_CPU_KERNEL(partial_recv, ops::PartialRecvOpCPUKernel, + ops::PartialRecvOpCPUKernel, + ops::PartialRecvOpCPUKernel, + ops::PartialRecvOpCPUKernel, + ops::PartialRecvOpCPUKernel); diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..49eafa5c7c4f5352ac8e2f761a09f40c539075b3 --- /dev/null +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -0,0 +1,106 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/partial_recv_op.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class PartialRecvOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 + auto out = ctx.Output("Out"); + auto out_dims = out->dims(); + auto numel = out->numel(); + + int rid = ctx.Attr("ring_id"); + int peer = ctx.Attr("peer"); + int data_type = ctx.Attr("dtype"); + int num = ctx.Attr("num"); + int id = ctx.Attr("id"); + framework::proto::VarType::Type type = + framework::proto::VarType::Type(data_type); + + PADDLE_ENFORCE_GE( + rid, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for partial_recv op must be non-negative.", rid)); + PADDLE_ENFORCE_GE( + peer, 0, + platform::errors::InvalidArgument( + "The peer (%d) for partial_recv op must be non-negative.", peer)); + PADDLE_ENFORCE_GE(num, 1, + platform::errors::InvalidArgument( + "The num (%d) for partial_recv op must >=1", num)); + PADDLE_ENFORCE_EQ( + (id >= 0 && id < num), true, + platform::errors::InvalidArgument( + "The id (%d) for partial_recv op must >=0 and ("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + PADDLE_ENFORCE_LT( + peer, comm->nranks(), + platform::errors::InvalidArgument("The value of peer (%d) you set must " + "be less than comm->nranks (%d).", + peer, comm->nranks())); + + out->mutable_data(out_dims, place); + ncclDataType_t dtype = platform::ToNCCLDataType(type); + int recv_numel = numel / num; + int offset = recv_numel * id; + + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::ncclRecv(out->data() + offset, recv_numel, dtype, + peer, comm->comm(), stream)); + VLOG(3) << "rank " << comm->rank() << " recv " << recv_numel + << " from offset[" << offset << "] from " << peer; +#else + PADDLE_THROW(platform::errors::Unavailable( + "PaddlePaddle should be compiled with NCCL and " + "NCCL version >= 2.7.3 is needed.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(partial_recv, ops::PartialRecvOpCUDAKernel, + ops::PartialRecvOpCUDAKernel, + ops::PartialRecvOpCUDAKernel, + ops::PartialRecvOpCUDAKernel, + ops::PartialRecvOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/partial_recv_op.h b/paddle/fluid/operators/collective/partial_recv_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d64fa39939c2d6e85a709874f45977c15b26230a --- /dev/null +++ b/paddle/fluid/operators/collective/partial_recv_op.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class PartialRecvOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support partial_recv for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/partial_send_op.cc b/paddle/fluid/operators/collective/partial_send_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..7689e6ed3b51f457769ddb393aae11906402d6ed --- /dev/null +++ b/paddle/fluid/operators/collective/partial_send_op.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/partial_send_op.h" + +namespace paddle { +namespace operators { + +class PartialSendOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "PartialSend"); + int peer = ctx->Attrs().Get("peer"); + int ring_id = ctx->Attrs().Get("ring_id"); + int num = ctx->Attrs().Get("num"); + int id = ctx->Attrs().Get("id"); + + PADDLE_ENFORCE_GE( + peer, 0, + platform::errors::InvalidArgument( + "The peer (%d) for partial_send op must be non-negative.", peer)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for partial_send op must be non-negative.", + ring_id)); + PADDLE_ENFORCE_GE(num, 1, + platform::errors::InvalidArgument( + "The num (%d) for partial_send op must >=1", num)); + PADDLE_ENFORCE_EQ( + (id >= 0 && id < num), true, + platform::errors::InvalidArgument( + "The id (%d) for partial_send op must >=0 and ("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr("peer", "(int default 0) rank id for receiver.").SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddAttr("num", "(int default 1) The number of Input to be cut.") + .SetDefault(1); + AddAttr("id", + "(int default 0) ID of the part to be sent after Input cut.") + .SetDefault(0); + AddComment(R"DOC( +PartialSend Operator. +Divide the Input into num copies and only send the id part. 
+ +Reference: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html#sendrecv +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(partial_send, ops::PartialSendOp, + ops::PartialSendMaker); + +REGISTER_OP_CPU_KERNEL(partial_send, ops::PartialSendOpCPUKernel, + ops::PartialSendOpCPUKernel, + ops::PartialSendOpCPUKernel, + ops::PartialSendOpCPUKernel, + ops::PartialSendOpCPUKernel); diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..2463f208746ed6e40b7474dc47a5f981b8b3e57e --- /dev/null +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/partial_send_op.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class PartialSendCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 + auto x = ctx.Input("X"); + int numel = x->numel(); + int rid = ctx.Attr("ring_id"); + int peer = ctx.Attr("peer"); + int num = ctx.Attr("num"); + int id = ctx.Attr("id"); + + PADDLE_ENFORCE_GE( + rid, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for partial_send op must be non-negative.", rid)); + PADDLE_ENFORCE_GE( + peer, 0, + platform::errors::InvalidArgument( + "The peer (%d) for partial_send op must be non-negative.", peer)); + PADDLE_ENFORCE_GE(num, 1, + platform::errors::InvalidArgument( + "The num (%d) for partial_send op must >=1", num)); + PADDLE_ENFORCE_EQ( + (id >= 0 && id < num), true, + platform::errors::InvalidArgument( + "The id (%d) for partial_send op must >=0 and ("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + PADDLE_ENFORCE_LT( + peer, comm->nranks(), + platform::errors::InvalidArgument("The value of peer (%d) you set must " + "be less than comm->nranks (%d).", + peer, comm->nranks())); + + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + int send_numel = numel / num; + int offset = send_numel * id; + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + x->data() + offset, send_numel, dtype, peer, comm->comm(), stream)); + VLOG(3) << "rank " << comm->rank() << " send " << send_numel + << " from offset[" << offset << "] to " << peer; +#else + PADDLE_THROW(platform::errors::Unavailable( + "PaddlePaddle should be compiled with NCCL " + "and NCCL version >= 
2.7.3 is needed.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(partial_send, ops::PartialSendCUDAKernel, + ops::PartialSendCUDAKernel, + ops::PartialSendCUDAKernel, + ops::PartialSendCUDAKernel, + ops::PartialSendCUDAKernel); diff --git a/paddle/fluid/operators/collective/partial_send_op.h b/paddle/fluid/operators/collective/partial_send_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7550ac40078c40c12f21c9193fc4244058a3b362 --- /dev/null +++ b/paddle/fluid/operators/collective/partial_send_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class PartialSendOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support partial_send for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc index 69f1f4681a33d68d9a4d0efa09bd33d01834cff6..52a23c50c0e115536c87e479ff1763c8d440d550 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -27,10 +27,11 @@ class CRecvOpASCENDKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if defined(PADDLE_WITH_ASCEND_CL) - auto x = ctx.Output("Out"); - void* ptr = reinterpret_cast(const_cast(x->data())); - int numel = x->numel(); - HcclDataType dtype = platform::ToHCCLDataType(x->type()); + auto out = ctx.Output("Out"); + out->mutable_data(out->dims(), ctx.GetPlace()); + void* ptr = reinterpret_cast(const_cast(out->data())); + int numel = out->numel(); + HcclDataType dtype = platform::ToHCCLDataType(out->type()); int ring_id = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); @@ -54,8 +55,10 @@ class CRecvOpASCENDKernel : public framework::OpKernel { int root = peer; VLOG(3) << "begin hccl recv, parameter is: " - << "root " << root << ", comm: " << comm->comm() - << ", stream: " << stream; + << "ring_id:" << ring_id << ", nranks:" << nranks + << ", peer:" << peer << ", numel:" << numel << ", ptr:" << ptr + << ", dtype:" << dtype << ", root:" << root + << ", comm: " << comm->comm() << ", stream: " << stream; PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); diff --git a/paddle/fluid/operators/compat/affine_channel.pbtxt b/paddle/fluid/operators/compat/affine_channel.pbtxt new file mode 100644 index 
0000000000000000000000000000000000000000..83a55ab3a7d19fabc4c176ee9e434895c76e2484 --- /dev/null +++ b/paddle/fluid/operators/compat/affine_channel.pbtxt @@ -0,0 +1,19 @@ +type: "affine_channel" +def { + inputs { + name: "X" + } + inputs { + name: "Scale" + } + inputs { + name: "Bias" + } + attrs { + name: "data_layout" + type: STRING + } + outputs { + name: "Out" + } +} diff --git a/paddle/fluid/operators/compat/batch_norm.pbtxt b/paddle/fluid/operators/compat/batch_norm.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..4bfd08421327fd8a1ae127eb23446ab780dd11fc --- /dev/null +++ b/paddle/fluid/operators/compat/batch_norm.pbtxt @@ -0,0 +1,70 @@ +type: "batch_norm" +def { + inputs { + name: "X" + } + inputs { + name: "Scale" + } + inputs { + name: "Bias" + } + inputs { + name: "Mean" + } + inputs { + name: "Variance" + } + inputs { + name: "MomentumTensor" + } + outputs { + name: "Y" + } + outputs { + name: "MeanOut" + } + outputs { + name: "VarianceOut" + } + outputs { + name: "SavedMean" + } + outputs { + name: "SavedVariance" + } + outputs { + name: "ReserveSpace" + } + attrs { + name: "epsilon" + type: FLOAT + } +} +extra { + attrs { + name: "momentum" + type: FLOAT + } + attrs { + name: "Y0_threshold" + type: FLOAT + } + attrs { + name: "data_layout" + type: STRING + } + attrs { + name: "fuse_with_relu" + type: BOOLEAN + } + attrs { + name: "use_global_stats" + type: BOOLEAN + } + attrs { + name: "trainable_statistics" + type: BOOLEAN + } +} + diff --git a/paddle/fluid/operators/compat/concat.pbtxt b/paddle/fluid/operators/compat/concat.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..24e62fc30a913f9df84bf6ae94ef4b8b4a663562 --- /dev/null +++ b/paddle/fluid/operators/compat/concat.pbtxt @@ -0,0 +1,16 @@ +type: "concat" +def { + inputs { + name: "X" + } + inputs { + name: "AxisTensor" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} diff --git a/paddle/fluid/operators/compat/conv2d.pbtxt b/paddle/fluid/operators/compat/conv2d.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..ca07d4a36ff3cafd88d833287b7bf5b17ec81f4d --- /dev/null +++ b/paddle/fluid/operators/compat/conv2d.pbtxt @@ -0,0 +1,137 @@ +type: "conv2d" +def { + inputs { + name: "Input" + } + inputs { + name: "Filter" + } + inputs { + name: "Bias" + } + inputs { + name: "ResidualData" + } + outputs { + name: "Output" + } + attrs { + name: "strides" + type: INTS + } + attrs { + name: "paddings" + type: INTS + } + attrs { + name: "padding_algorithm" + type: STRING + } + attrs { + name: "groups" + type: INT + } + attrs { + name: "dilations" + type: INTS + } + attrs { + name: "data_format" + type: STRING + } +} +extra { + attrs { + name: "Input_scale" + type: FLOAT + } + attrs { + name: "Input0_threshold" + type: FLOAT + } + attrs { + name: "weight_scale" + type: FLOAT + } + attrs { + name: "quantization_type" + type: STRING + } + attrs { + name: "bit_length" + type: INT + } + attrs { + name: "out_threshold" + type: FLOAT + } + attrs { + name: "skip_quant" + type: BOOLEAN + } + attrs { + name: "fuse_relu_before_depthwise_conv" + type: BOOLEAN + } + attrs { + name: "fuse_relu" + type: BOOLEAN + } + attrs { + name: "fuse_brelu" + type: BOOLEAN + } + attrs { + name: "fuse_brelu_threshold" + type: FLOAT + } + attrs { + name: "fuse_activation" + type: STRING + } + attrs { + name: "fuse_alpha" + type: FLOAT + } + attrs { + name: "fuse_beta" + type: FLOAT + } + attrs { + name: "use_addto" + type: BOOLEAN + } + attrs { + name: 
"fuse_residual_connection" + type: BOOLEAN + } + attrs { + name: "Scale_in" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "Scale_in_eltwise" + type: FLOAT + } + attrs { + name: "Scale_weights" + type: FLOATS + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "workspace_size_MB" + type: INT + } + attrs { + name: "exhaustive_search" + type: BOOLEAN + } +} + diff --git a/paddle/fluid/operators/compat/conv2d_transpose.pbtxt b/paddle/fluid/operators/compat/conv2d_transpose.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..0654907934025607c393f5ea8dc95375679a03ab --- /dev/null +++ b/paddle/fluid/operators/compat/conv2d_transpose.pbtxt @@ -0,0 +1,74 @@ +type: "conv2d_transpose" +def { + inputs { + name: "Input" + } + inputs { + name: "Filter" + } + inputs { + name: "Bias" + } + outputs { + name: "Output" + } + attrs { + name: "output_padding" + type: INTS + } + attrs { + name: "output_size" + type: INTS + } + attrs { + name: "groups" + type: INT + } + attrs { + name: "dilations" + type: INTS + } + attrs { + name: "strides" + type: INTS + } + attrs { + name: "paddings" + type: INTS + } + attrs { + name: "padding_algorithm" + type: STRING + } + attrs { + name: "data_format" + type: STRING + } +} +extra { + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "fuse_relu" + type: BOOLEAN + } + attrs { + name: "fuse_activation" + type: STRING + } + attrs { + name: "fuse_alpha" + type: FLOAT + } + attrs { + name: "fuse_beta" + type: FLOAT + } + attrs { + name: "workspace_size_MB" + type: INT + } +} + diff --git a/paddle/fluid/operators/compat/conv3d.pbtxt b/paddle/fluid/operators/compat/conv3d.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..ec88172faabc1447669b23edf04475df1ca6f07c --- /dev/null +++ b/paddle/fluid/operators/compat/conv3d.pbtxt @@ -0,0 +1,82 @@ +type: "conv3d" +def { + inputs { + name: "Input" + } + inputs { + name: "Filter" + } + inputs { + name: "ResidualData" + } + outputs { + name: "Output" + } + attrs { + name: "strides" + type: INTS + } + attrs { + name: "paddings" + type: INTS + } + attrs { + name: "padding_algorithm" + type: STRING + } + attrs { + name: "groups" + type: INT + } + attrs { + name: "dilations" + type: INTS + } + attrs { + name: "data_format" + type: STRING + } +} +extra { + attrs { + name: "fuse_relu_before_depthwise_conv" + type: BOOLEAN + } + attrs { + name: "fuse_relu" + type: BOOLEAN + } + attrs { + name: "fuse_activation" + type: STRING + } + attrs { + name: "fuse_alpha" + type: FLOAT + } + attrs { + name: "fuse_beta" + type: FLOAT + } + attrs { + name: "use_addto" + type: BOOLEAN + } + attrs { + name: "fuse_residual_connection" + type: BOOLEAN + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "workspace_size_MB" + type: INT + } + attrs { + name: "exhaustive_search" + type: BOOLEAN + } +} + diff --git a/paddle/fluid/operators/compat/cvm.pbtxt b/paddle/fluid/operators/compat/cvm.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..f94e6d276c328bb77a6a64935fede5060242478f --- /dev/null +++ b/paddle/fluid/operators/compat/cvm.pbtxt @@ -0,0 +1,17 @@ +type: "cvm" +def { + inputs { + name: "X" + } + inputs { + name: "CVM" + } + outputs { + name: "Y" + } + attrs { + name: "use_cvm" + type: BOOLEAN + } +} + diff --git a/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt new file mode 100644 index 
0000000000000000000000000000000000000000..ded143986159fdc973e345092543b4080366ad0c --- /dev/null +++ b/paddle/fluid/operators/compat/depthwise_conv2d.pbtxt @@ -0,0 +1,129 @@ +type: "depthwise_conv2d" +def { + inputs { + name: "Input" + } + inputs { + name: "Filter" + } + inputs { + name: "Bias" + } + inputs { + name: "ResidualData" + } + outputs { + name: "Output" + } + attrs { + name: "strides" + type: INTS + } + attrs { + name: "paddings" + type: INTS + } + attrs { + name: "padding_algorithm" + type: STRING + } + attrs { + name: "groups" + type: INT + } + attrs { + name: "dilations" + type: INTS + } + attrs { + name: "data_format" + type: STRING + } +} +extra { + attrs { + name: "Input_scale" + type: FLOAT + } + attrs { + name: "quantization_type" + type: STRING + } + attrs { + name: "bit_length" + type: INT + } + attrs { + name: "out_threshold" + type: FLOAT + } + attrs { + name: "skip_quant" + type: BOOLEAN + } + attrs { + name: "fuse_relu_before_depthwise_conv" + type: BOOLEAN + } + attrs { + name: "fuse_relu" + type: BOOLEAN + } + attrs { + name: "fuse_brelu" + type: BOOLEAN + } + attrs { + name: "fuse_brelu_threshold" + type: FLOAT + } + attrs { + name: "fuse_activation" + type: STRING + } + attrs { + name: "fuse_alpha" + type: FLOAT + } + attrs { + name: "fuse_beta" + type: FLOAT + } + attrs { + name: "use_addto" + type: BOOLEAN + } + attrs { + name: "fuse_residual_connection" + type: BOOLEAN + } + attrs { + name: "Scale_in" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "Scale_in_eltwise" + type: FLOAT + } + attrs { + name: "Scale_weights" + type: FLOATS + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "workspace_size_MB" + type: INT + } + attrs { + name: "exhaustive_search" + type: BOOLEAN + } +} + diff --git a/paddle/fluid/operators/compat/elementwise_add.pbtxt b/paddle/fluid/operators/compat/elementwise_add.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..5b55f3981c77daa5e81590ee499f3d252a34c122 --- /dev/null +++ b/paddle/fluid/operators/compat/elementwise_add.pbtxt @@ -0,0 +1,46 @@ +type: "elementwise_add" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "out_threshold" + type: FLOAT + } + attrs { + name: "Out0_threshold" + type: FLOAT + } + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } +} diff --git a/paddle/fluid/operators/compat/elementwise_div.pbtxt b/paddle/fluid/operators/compat/elementwise_div.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..a73d2072029a3a00c19f35dfd72adce739fc2f5e --- /dev/null +++ b/paddle/fluid/operators/compat/elementwise_div.pbtxt @@ -0,0 +1,42 @@ +type: "elementwise_div" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "act" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/elementwise_mul.pbtxt b/paddle/fluid/operators/compat/elementwise_mul.pbtxt new file 
mode 100644 index 0000000000000000000000000000000000000000..22289e2689c10a7339a0e34a38f90b40c9453588 --- /dev/null +++ b/paddle/fluid/operators/compat/elementwise_mul.pbtxt @@ -0,0 +1,38 @@ +type: "elementwise_mul" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } +} diff --git a/paddle/fluid/operators/compat/elementwise_pow.pbtxt b/paddle/fluid/operators/compat/elementwise_pow.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..a2ab73f409b7801d375b25c90e24ed1f65ed82f0 --- /dev/null +++ b/paddle/fluid/operators/compat/elementwise_pow.pbtxt @@ -0,0 +1,42 @@ +type: "elementwise_pow" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "act" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/elementwise_sub.pbtxt b/paddle/fluid/operators/compat/elementwise_sub.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..9f38601f585ea8aabdf67fbd2fc9f9189a3f21a0 --- /dev/null +++ b/paddle/fluid/operators/compat/elementwise_sub.pbtxt @@ -0,0 +1,42 @@ +type: "elementwise_sub" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "act" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_channel_wise_dequantize_max_abs.pbtxt b/paddle/fluid/operators/compat/fake_channel_wise_dequantize_max_abs.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..ec80ffaaf32ae10b4d340b320e06cf24d72f21e5 --- /dev/null +++ b/paddle/fluid/operators/compat/fake_channel_wise_dequantize_max_abs.pbtxt @@ -0,0 +1,20 @@ +type: "fake_channel_wise_dequantize_max_abs" +def { + inputs { + name: "X" + } + inputs { + name: "Scales" + } + outputs { + name: "Out" + } + attrs { + name: "quant_bits" + type: INTS + } + attrs { + name: "quant_axis" + type: INT + } +} diff --git a/paddle/fluid/operators/compat/fake_channel_wise_quantize_abs_max.pbtxt b/paddle/fluid/operators/compat/fake_channel_wise_quantize_abs_max.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..04fa10cc2b3d1671c4b32afa4d659b86a191f00e --- /dev/null +++ b/paddle/fluid/operators/compat/fake_channel_wise_quantize_abs_max.pbtxt @@ -0,0 +1,20 @@ +type: "fake_channel_wise_quantize_abs_max" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + attrs { + name: "quant_axis" + type: INT + } + attrs { + name: "bit_length" + type: INT + } +} diff --git a/paddle/fluid/operators/compat/fake_channel_wise_quantize_dequantize_abs_max.pbtxt 
b/paddle/fluid/operators/compat/fake_channel_wise_quantize_dequantize_abs_max.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..7c49da93e71836032f2eb8f784def337d27b4d4d --- /dev/null +++ b/paddle/fluid/operators/compat/fake_channel_wise_quantize_dequantize_abs_max.pbtxt @@ -0,0 +1,46 @@ +type: "fake_channel_wise_quantize_dequantize_abs_max" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + attrs { + name: "quant_axis" + type: INT + } + attrs { + name: "bit_length" + type: INT + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_dequantize_max_abs.pbtxt b/paddle/fluid/operators/compat/fake_dequantize_max_abs.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..0a55c0e44862ce8aba6fbe07dfad73382266c426 --- /dev/null +++ b/paddle/fluid/operators/compat/fake_dequantize_max_abs.pbtxt @@ -0,0 +1,38 @@ +type: "fake_dequantize_max_abs" +def { + inputs { + name: "X" + } + inputs { + name: "Scale" + } + outputs { + name: "Out" + } + attrs { + name: "max_range" + type: FLOAT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_quantize_abs_max.pbtxt b/paddle/fluid/operators/compat/fake_quantize_abs_max.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..92ee54eb94c0e1da2d2069f722fded5c5b9ba66d --- /dev/null +++ b/paddle/fluid/operators/compat/fake_quantize_abs_max.pbtxt @@ -0,0 +1,38 @@ +type: "fake_quantize_abs_max" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + attrs { + name: "bit_length" + type: INT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_quantize_dequantize_abs_max.pbtxt b/paddle/fluid/operators/compat/fake_quantize_dequantize_abs_max.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..bebb397e20bbe7dd31e4b374621c55b49b48b38e --- /dev/null +++ b/paddle/fluid/operators/compat/fake_quantize_dequantize_abs_max.pbtxt @@ -0,0 +1,38 @@ +type: "fake_quantize_dequantize_abs_max" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + attrs { + name: "bit_length" + type: INT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_quantize_moving_average_abs_max.pbtxt b/paddle/fluid/operators/compat/fake_quantize_moving_average_abs_max.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..dddb58f827ea036133649c5fb8a79869ed20f38b --- /dev/null +++ 
b/paddle/fluid/operators/compat/fake_quantize_moving_average_abs_max.pbtxt @@ -0,0 +1,61 @@ +type: "fake_quantize_moving_average_abs_max" +def { + inputs { + name: "X" + } + inputs { + name: "InScale" + } + inputs { + name: "InAccum" + } + inputs { + name: "InState" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + outputs { + name: "OutState" + } + outputs { + name: "OutAccum" + } + attrs { + name: "moving_rate" + type: FLOAT + } + attrs { + name: "bit_length" + type: INT + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fake_quantize_range_abs_max.pbtxt b/paddle/fluid/operators/compat/fake_quantize_range_abs_max.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..1050b724ee6b44e44945309b06c6bde6cda18631 --- /dev/null +++ b/paddle/fluid/operators/compat/fake_quantize_range_abs_max.pbtxt @@ -0,0 +1,55 @@ +type: "fake_quantize_range_abs_max" +def { + inputs { + name: "X" + } + inputs { + name: "InScale" + } + inputs { + name: "Iter" + } + outputs { + name: "Out" + } + outputs { + name: "OutScale" + } + outputs { + name: "OutScales" + } + attrs { + name: "window_size" + type: INT + } + attrs { + name: "bit_length" + type: INT + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fc.pbtxt b/paddle/fluid/operators/compat/fc.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..55e1a22ce4da5f936487b0d2517ec2c76f0f8e5b --- /dev/null +++ b/paddle/fluid/operators/compat/fc.pbtxt @@ -0,0 +1,97 @@ +type: "fc" +def { + inputs { + name: "Input" + } + inputs { + name: "W" + } + inputs { + name: "Bias" + } + outputs { + name: "Out" + } + attrs { + name: "in_num_col_dims" + type: INT + } + attrs { + name: "activation_type" + type: STRING + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "padding_weights" + type: BOOLEAN + } + attrs { + name: "@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "weight_scale" + type: FLOATS + } + attrs { + name: "Input_scale" + type: FLOAT + } + attrs { + name: "out_scale" + type: FLOAT + } + attrs { + name: "out_threshold" + type: FLOAT + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "enable_int8" + type: BOOLEAN + } + attrs { + name: "use_fc_padding" + type: BOOLEAN + } + attrs { + name: "use_gpu" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fill_constant.pbtxt b/paddle/fluid/operators/compat/fill_constant.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..26fecf623c19cd294d86c5e37c91e7732cd5a1a5 --- /dev/null 
+++ b/paddle/fluid/operators/compat/fill_constant.pbtxt @@ -0,0 +1,61 @@ +type: "fill_constant" +def { + inputs { + name: "ValueTensor" + } + inputs { + name: "ShapeTensor" + } + inputs { + name: "ShapeTensorList" + } + outputs { + name: "Out" + } + attrs { + name: "dtype" + type: INT + } + attrs { + name: "shape" + type: LONGS + } + attrs { + name: "value" + type: FLOAT + } +} +extra { + attrs { + name: "str_value" + type: STRING + } + attrs { + name: "force_cpu" + type: BOOLEAN + } + attrs { + name: "place_type" + type: INT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/flatten2.pbtxt b/paddle/fluid/operators/compat/flatten2.pbtxt new file mode 100755 index 0000000000000000000000000000000000000000..6b8a6661a6fd7d66d9a16ee64cefce8bccb374f4 --- /dev/null +++ b/paddle/fluid/operators/compat/flatten2.pbtxt @@ -0,0 +1,38 @@ +type: "flatten2" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "XShape" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/gru.pbtxt b/paddle/fluid/operators/compat/gru.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..38aa8a92f75bd92801333cadbc0d1c1b1068c790 --- /dev/null +++ b/paddle/fluid/operators/compat/gru.pbtxt @@ -0,0 +1,65 @@ +type: "gru" +def { + inputs { + name: "Input" + } + inputs { + name: "H0" + } + inputs { + name: "Weight" + } + inputs { + name: "Bias" + } + outputs { + name: "BatchGate" + } + outputs { + name: "BatchResetHiddenPrev" + } + outputs { + name: "BatchHidden" + } + outputs { + name: "Hidden" + } + attrs { + name: "activation" + type: STRING + } + attrs { + name: "gate_activation" + type: STRING + } + attrs { + name: "is_reverse" + type: BOOLEAN + } + attrs { + name: "origin_mode" + type: BOOLEAN + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/hard_swish.pbtxt b/paddle/fluid/operators/compat/hard_swish.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..9951513741a61a8245296fe378b02aced3c17793 --- /dev/null +++ b/paddle/fluid/operators/compat/hard_swish.pbtxt @@ -0,0 +1,56 @@ +type: "hard_swish" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "threshold" + type: FLOAT + } + attrs { + name: "scale" + type: FLOAT + } + attrs { + name: "offset" + type: FLOAT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "name" + type: STRING + } + attrs { + name: "@ENABLE_CACHE_RUNTIME_CONTEXT@" + type: BOOLEAN + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git 
a/paddle/fluid/operators/compat/layer_norm.pbtxt b/paddle/fluid/operators/compat/layer_norm.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..dbb78e0a8baa1efae2efdf66a8520fcc9a505b65 --- /dev/null +++ b/paddle/fluid/operators/compat/layer_norm.pbtxt @@ -0,0 +1,63 @@ +type: "layer_norm" +def { + inputs { + name: "X" + } + inputs { + name: "Scale" + } + inputs { + name: "Bias" + } + outputs { + name: "Y" + } + outputs { + name: "Mean" + } + outputs { + name: "Variance" + } + attrs { + name: "epsilon" + type: FLOAT + } + attrs { + name: "begin_norm_axis" + type: INT + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/leaky_relu.pbtxt b/paddle/fluid/operators/compat/leaky_relu.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..8618b72ca87485480b0f46d3091b32d6bb39611b --- /dev/null +++ b/paddle/fluid/operators/compat/leaky_relu.pbtxt @@ -0,0 +1,52 @@ +type: "leaky_relu" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "alpha" + type: FLOAT + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "name" + type: STRING + } + attrs { + name: "@ENABLE_CACHE_RUNTIME_CONTEXT@" + type: BOOLEAN + } + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/lstm.pbtxt b/paddle/fluid/operators/compat/lstm.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..889911a8408cb0f9c3b48b856900383194d1c884 --- /dev/null +++ b/paddle/fluid/operators/compat/lstm.pbtxt @@ -0,0 +1,72 @@ +type: "lstm" +def { + inputs { + name: "Input" + } + inputs { + name: "H0" + } + inputs { + name: "C0" + } + inputs { + name: "Weight" + } + inputs { + name: "Bias" + } + outputs { + name: "Hidden" + } + outputs { + name: "Cell" + } + outputs { + name: "BatchGate" + } + outputs { + name: "BatchCellPreAct" + } + attrs { + name: "use_peepholes" + type: BOOLEAN + } + attrs { + name: "is_reverse" + type: BOOLEAN + } + attrs { + name: "gate_activation" + type: STRING + } + attrs { + name: "cell_activation" + type: STRING + } + attrs { + name: "candidate_activation" + type: STRING + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/matmul.pbtxt b/paddle/fluid/operators/compat/matmul.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..8f29d93660608928a21dbb96e16b7a579fa3aa63 --- /dev/null +++ b/paddle/fluid/operators/compat/matmul.pbtxt @@ -0,0 +1,102 @@ +type: "matmul" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "alpha" + type: FLOAT + } + attrs { + name: "transpose_X" + type: 
BOOLEAN + } + attrs { + name: "transpose_Y" + type: BOOLEAN + } +} +extra { + attrs { + name: "head_number" + type: INT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "fused_reshape_Out" + type: INTS + } + attrs { + name: "fused_reshape_X" + type: INTS + } + attrs { + name: "fused_reshape_Y" + type: INTS + } + attrs { + name: "fused_transpose_Out" + type: INTS + } + attrs { + name: "fused_transpose_X" + type: INTS + } + attrs { + name: "fused_transpose_Y" + type: INTS + } +} diff --git a/paddle/fluid/operators/compat/matmul_v2.pbtxt b/paddle/fluid/operators/compat/matmul_v2.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..5f43e1f8bf0e0c502566a2cc783b8927e5df56cc --- /dev/null +++ b/paddle/fluid/operators/compat/matmul_v2.pbtxt @@ -0,0 +1,42 @@ +type: "matmul_v2" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "trans_x" + type: BOOLEAN + } + attrs { + name: "trans_y" + type: BOOLEAN + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/mul.pbtxt b/paddle/fluid/operators/compat/mul.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..617775eaaae9e7b7754fb3e19323063e3d5f20db --- /dev/null +++ b/paddle/fluid/operators/compat/mul.pbtxt @@ -0,0 +1,99 @@ +type: "mul" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "x_num_col_dims" + type: INT + } + attrs { + name: "y_num_col_dims" + type: INT + } +} +extra { + attrs { + name: "Out0_threshold" + type: FLOAT + } + attrs { + name: "bit_length" + type: INT + } + attrs { + name: "quantization_type" + type: STRING + } + attrs { + name: "skip_quant" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "scale_x" + type: FLOAT + } + attrs { + name: "scale_y" + type: FLOATS + } + attrs { + name: "scale_out" + type: FLOAT + } + attrs { + name: "force_fp32_output" + type: BOOLEAN + } + attrs { + name: "enable_int8" + type: BOOLEAN + } + attrs { + name: "X_scale" + type: FLOAT + } + attrs { + name: "weight_scale" + type: FLOAT + } + attrs { + name: "out_scale" + type: FLOAT + } + attrs { + name: "out_threshold" + type: FLOAT + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + +} diff --git a/paddle/fluid/operators/compat/pool2d.pbtxt b/paddle/fluid/operators/compat/pool2d.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..1620d1ef1c649ab8a90307241ae8956b62ceee52 --- /dev/null +++ 
b/paddle/fluid/operators/compat/pool2d.pbtxt @@ -0,0 +1,92 @@ +type: "pool2d" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "pooling_type" + type: STRING + } + attrs { + name: "ksize" + type: INTS + } + attrs { + name: "global_pooling" + type: BOOLEAN + } + attrs { + name: "strides" + type: INTS + } + attrs { + name: "paddings" + type: INTS + } + attrs { + name: "exclusive" + type: BOOLEAN + } + attrs { + name: "adaptive" + type: BOOLEAN + } + attrs { + name: "ceil_mode" + type: BOOLEAN + } + attrs { + name: "data_format" + type: STRING + } + attrs { + name: "padding_algorithm" + type: STRING + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/reduce_mean.pbtxt b/paddle/fluid/operators/compat/reduce_mean.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..eea6ad127fd4520b30ca8dc7222fca425ba399da --- /dev/null +++ b/paddle/fluid/operators/compat/reduce_mean.pbtxt @@ -0,0 +1,55 @@ +type: "reduce_mean" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "dim" + type: INTS + } + attrs { + name: "keep_dim" + type: BOOLEAN + } +} +extra { + attrs { + name: "reduce_all" + type: BOOLEAN + } + attrs { + name: "in_dtype" + type: INT + } + attrs { + name: "out_dtype" + type: INT + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/relu.pbtxt b/paddle/fluid/operators/compat/relu.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..a3dc65ae35c008fe9e5838f16bc28d71e0eff811 --- /dev/null +++ b/paddle/fluid/operators/compat/relu.pbtxt @@ -0,0 +1,23 @@ +type: "relu" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "X0_threshold" + type: FLOAT + } + attrs { + name: "out_threshold" + type: FLOAT + } + attrs { + name: "Out0_threshold" + type: FLOAT + } +} diff --git a/paddle/fluid/operators/compat/relu6.pbtxt b/paddle/fluid/operators/compat/relu6.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..340b13020144a83edc4b26fdee8ec33e2c8cbb15 --- /dev/null +++ b/paddle/fluid/operators/compat/relu6.pbtxt @@ -0,0 +1,52 @@ +type: "relu6" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "threshold" + type: FLOAT + } +} +extra { + attrs { + name: "name" + type: STRING + } + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "@ENABLE_CACHE_RUNTIME_CONTEXT@" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: 
"op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/reshape2.pbtxt b/paddle/fluid/operators/compat/reshape2.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..d975aed61fa1b7a4f2aba08d353d042d21c2dccb --- /dev/null +++ b/paddle/fluid/operators/compat/reshape2.pbtxt @@ -0,0 +1,52 @@ +type: "reshape2" +def { + inputs { + name: "X" + } + inputs { + name: "Shape" + } + inputs { + name: "ShapeTensor" + } + outputs { + name: "XShape" + } + outputs { + name: "Out" + } + attrs { + name: "shape" + type: INTS + } +} +extra { + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/scale.pbtxt b/paddle/fluid/operators/compat/scale.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..4667b20d6ab56578404062f1e71ebe9d0b7a9868 --- /dev/null +++ b/paddle/fluid/operators/compat/scale.pbtxt @@ -0,0 +1,51 @@ +type: "scale" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "bias" + type: FLOAT + } + attrs { + name: "scale" + type: FLOAT + } + attrs { + name: "bias_after_scale" + type: BOOLEAN + } +} +extra { + attrs { + name: "name" + type: STRING + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/sequence_conv.pbtxt b/paddle/fluid/operators/compat/sequence_conv.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..c5335a25c557a7ee904cbb805735a63d1465ebd5 --- /dev/null +++ b/paddle/fluid/operators/compat/sequence_conv.pbtxt @@ -0,0 +1,53 @@ +type: "sequence_conv" +def { + inputs { + name: "X" + } + inputs { + name: "Filter" + } + inputs { + name: "PaddingData" + } + outputs { + name: "Out" + } + attrs { + name: "contextLength" + type: INT + } + attrs { + name: "contextStart" + type: INT + } + attrs { + name: "contextStride" + type: INT + } +} +extra { + attrs { + name: "paddingTrainable" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/sequence_expand.pbtxt b/paddle/fluid/operators/compat/sequence_expand.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..38169d7b57ded849af1886828f4ae18fd2b7841d --- /dev/null +++ b/paddle/fluid/operators/compat/sequence_expand.pbtxt @@ -0,0 +1,38 @@ +type: "sequence_expand" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "ref_level" + type: INT + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/sequence_pool.pbtxt 
b/paddle/fluid/operators/compat/sequence_pool.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..c45f457fe0d9ff1e4b5e9589662590333aac16e3 --- /dev/null +++ b/paddle/fluid/operators/compat/sequence_pool.pbtxt @@ -0,0 +1,47 @@ +type: "sequence_pool" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "MaxIndex" + } + attrs { + name: "pooltype" + type: STRING + } + attrs { + name: "pad_value" + type: FLOAT + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/sigmoid.pbtxt b/paddle/fluid/operators/compat/sigmoid.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..7b53aa402c1183d3f9688cc8528ad42dcd10e1b5 --- /dev/null +++ b/paddle/fluid/operators/compat/sigmoid.pbtxt @@ -0,0 +1,39 @@ +type: "sigmoid" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/softmax.pbtxt b/paddle/fluid/operators/compat/softmax.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..04f15ace15f449ad33357295033c4f61261276da --- /dev/null +++ b/paddle/fluid/operators/compat/softmax.pbtxt @@ -0,0 +1,55 @@ +type: "softmax" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "data_format" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } +} diff --git a/paddle/fluid/operators/compat/sqrt.pbtxt b/paddle/fluid/operators/compat/sqrt.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..2dbcba802a4086e841080399300eb95f8ba1069d --- /dev/null +++ b/paddle/fluid/operators/compat/sqrt.pbtxt @@ -0,0 +1,39 @@ +type: "sqrt" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/square.pbtxt b/paddle/fluid/operators/compat/square.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..1a4f0640bec79a1e1a75026b90113cdef7650b5f --- /dev/null +++ b/paddle/fluid/operators/compat/square.pbtxt @@ -0,0 +1,44 @@ +type: "square" +def { + inputs { + name: "X" + } + outputs { + name: 
"Out" + } +} + +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/squeeze2.pbtxt b/paddle/fluid/operators/compat/squeeze2.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..160e6a7278649408f7c5245eb53029610407ebc0 --- /dev/null +++ b/paddle/fluid/operators/compat/squeeze2.pbtxt @@ -0,0 +1,38 @@ +type: "squeeze2" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "XShape" + } + attrs { + name: "axes" + type: INTS + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/swish.pbtxt b/paddle/fluid/operators/compat/swish.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..1dd8e577d9c738f20f7f6fc038019b1cfca133af --- /dev/null +++ b/paddle/fluid/operators/compat/swish.pbtxt @@ -0,0 +1,44 @@ +type: "swish" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "beta" + type: FLOAT + } + attrs { + name: "name" + type: STRING + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff --git a/paddle/fluid/operators/compat/tanh.pbtxt b/paddle/fluid/operators/compat/tanh.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..a0e6cf8a0a90add80200a524e2721eec00a07751 --- /dev/null +++ b/paddle/fluid/operators/compat/tanh.pbtxt @@ -0,0 +1,39 @@ +type: "tanh" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } +} +extra { + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_cudnn" + type: BOOLEAN + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/transpose.pbtxt b/paddle/fluid/operators/compat/transpose.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..1cd04a4da4a174808f81f3b1d5c4f6093b5126ee --- /dev/null +++ b/paddle/fluid/operators/compat/transpose.pbtxt @@ -0,0 +1,52 @@ +type: "transpose" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INTS + } +} +extra { + attrs { + name: "data_format" + type: STRING + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} + diff 
--git a/paddle/fluid/operators/compat/transpose2.pbtxt b/paddle/fluid/operators/compat/transpose2.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..31aecd24bc911b446b43f351885549be9d84533a --- /dev/null +++ b/paddle/fluid/operators/compat/transpose2.pbtxt @@ -0,0 +1,54 @@ +type: "transpose2" +def { + inputs { + name: "X" + } + outputs { + name: "Out" + } + outputs { + name: "XShape" + } + attrs { + name: "axis" + type: INTS + } +} +extra { + attrs { + name: "data_format" + type: STRING + } + attrs { + name: "use_mkldnn" + type: BOOLEAN + } + attrs { + name: "use_quantizer" + type: BOOLEAN + } + attrs { + name: "mkldnn_data_type" + type: STRING + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/unsqueeze2.pbtxt b/paddle/fluid/operators/compat/unsqueeze2.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..ed3c32754a59f0a30ad4351bdf188d8ae7d68692 --- /dev/null +++ b/paddle/fluid/operators/compat/unsqueeze2.pbtxt @@ -0,0 +1,44 @@ +type: "unsqueeze2" +def { + inputs { + name: "X" + } + inputs { + name: "AxesTensor" + } + inputs { + name: "AxesTensorList" + } + outputs { + name: "Out" + } + outputs { + name: "XShape" + } + attrs { + name: "axes" + type: INTS + } +} +extra { + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/while.pbtxt b/paddle/fluid/operators/compat/while.pbtxt new file mode 100644 index 0000000000000000000000000000000000000000..34435e1d9e5ff383dd1f7fca82ee10b5428b4acd --- /dev/null +++ b/paddle/fluid/operators/compat/while.pbtxt @@ -0,0 +1,49 @@ +type: "while" +def { + inputs { + name: "X" + } + inputs { + name: "Condition" + } + outputs { + name: "Out" + } + outputs { + name: "StepScopes" + } + attrs { + name: "sub_block" + type: BLOCK + } +} +extra { + attrs { + name: "is_test" + type: BOOLEAN + } + attrs { + name: "skip_eager_deletion_vars" + type: STRINGS + } + attrs { + name: "op_role" + type: INT + } + attrs { + name: "op_role_var" + type: STRINGS + } + attrs { + name: "op_namescope" + type: STRING + } + attrs { + name: "op_callstack" + type: STRINGS + } + attrs { + name: "op_device" + type: STRING + } +} diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index bbc42d97146f24e69d2f2337967e129af013fb6c..6095516f92fa529e1d8c8ee21519e839687dcac5 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -233,7 +233,8 @@ REGISTER_OP_CPU_KERNEL( ops::ConcatKernel, ops::ConcatKernel, - ops::ConcatKernel); + ops::ConcatKernel, + ops::ConcatKernel); REGISTER_OP_CPU_KERNEL( concat_grad, ops::ConcatGradKernel, @@ -242,4 +243,5 @@ REGISTER_OP_CPU_KERNEL( ops::ConcatGradKernel, ops::ConcatGradKernel, - ops::ConcatGradKernel); + ops::ConcatGradKernel, + ops::ConcatGradKernel); diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc index 8c30703f2576b35deb419238de08c5f2fa7b42d2..63025c3bd030f2f3917654a0dcc8bf6de8a98425 100644 --- a/paddle/fluid/operators/concat_op.cu.cc +++ b/paddle/fluid/operators/concat_op.cu.cc @@ -23,7 +23,8 @@ 
REGISTER_OP_CUDA_KERNEL( ops::ConcatKernel, ops::ConcatKernel, ops::ConcatKernel, - ops::ConcatKernel); + ops::ConcatKernel, + ops::ConcatKernel); REGISTER_OP_CUDA_KERNEL( concat_grad, ops::ConcatGradKernel, @@ -31,4 +32,5 @@ REGISTER_OP_CUDA_KERNEL( ops::ConcatGradKernel, ops::ConcatGradKernel, ops::ConcatGradKernel, - ops::ConcatGradKernel); + ops::ConcatGradKernel, + ops::ConcatGradKernel); diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc index 87bb3397ca2672ce377b74682cb0445e31b03677..d242c9f8c3fbd538b3ec0ce95fa5929c7c8ccd0a 100644 --- a/paddle/fluid/operators/concat_op_npu.cc +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -52,9 +52,11 @@ class ConcatNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context() .stream(); - auto runner = NpuOpRunner( - "ConcatD", {inputs}, {*out}, - {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}); + NpuOpRunner runner{ + "ConcatD", + {inputs}, + {*out}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; runner.AddInputNames(names); runner.Run(stream); } @@ -101,8 +103,9 @@ class ConcatGradNPUKernel : public framework::OpKernel { sizes.push_back(ins[j]->dims()[dim]); } } - auto runner = NpuOpRunner("SliceD", {*out_grad}, {*outs[j]}, - {{"offsets", offsets}, {"size", sizes}}); + const auto& runner = + NpuOpRunner("SliceD", {*out_grad}, {*outs[j]}, + {{"offsets", offsets}, {"size", sizes}}); runner.Run(stream); } if (ins[j]->numel() != 0UL) { diff --git a/paddle/fluid/operators/conj_op.cc b/paddle/fluid/operators/conj_op.cc index 3afe4f1e3d1027ce37404544dcd0929cc41cb6a3..4d801bc003ea9ac417ff66deda8359f2921e01f6 100644 --- a/paddle/fluid/operators/conj_op.cc +++ b/paddle/fluid/operators/conj_op.cc @@ -78,9 +78,9 @@ REGISTER_OPERATOR(conj, ops::ConjOp, ops::ConjOpMaker, REGISTER_OP_CPU_KERNEL( conj, ops::ConjKernel, + paddle::platform::complex>, ops::ConjKernel, + paddle::platform::complex>, ops::ConjKernel, ops::ConjKernel, ops::ConjKernel, diff --git a/paddle/fluid/operators/conj_op.cu b/paddle/fluid/operators/conj_op.cu index 601caeb50558876b972014813ca6dc247aecfeba..d04024d70a8ea66128010d39c9eb1233d28caf03 100644 --- a/paddle/fluid/operators/conj_op.cu +++ b/paddle/fluid/operators/conj_op.cu @@ -13,15 +13,14 @@ // limitations under the License. 
#include "paddle/fluid/operators/conj_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( conj, ops::ConjKernel, + paddle::platform::complex>, ops::ConjKernel, + paddle::platform::complex>, ops::ConjKernel, ops::ConjKernel, ops::ConjKernel, diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index e23fb05833c0fa428b4f74785ff947a4c785648e..1a2df2a0c7ba34f67ecb7c2ade002fcb4475229f 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -19,4 +19,6 @@ else() target_link_libraries(conditional_block_infer_op conditional_block_op) endif() -file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") +file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") +file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n") +file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n") diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cc b/paddle/fluid/operators/controlflow/bitwise_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..cfe0d99962190aa282b46e212d01df4b718d1305 --- /dev/null +++ b/paddle/fluid/operators/controlflow/bitwise_op.cc @@ -0,0 +1,174 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/controlflow/bitwise_op.h" +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class BinaryBitwiseOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + OpComment comment; + AddInput("X", string::Sprintf( + "Input Tensor of ``%s`` . It is " + "a N-D Tensor of bool, uint8, int8, int16, int32, int64.", + comment.type)); + AddInput("Y", string::Sprintf( + "Input Tensor of ``%s`` . It is " + "a N-D Tensor of bool, uint8, int8, int16, int32, int64.", + comment.type)); + AddOutput("Out", + string::Sprintf("Result of ``%s`` . It is a N-D Tensor with " + "the same data type of input Tensor.", + comment.type)); + AddComment(string::Sprintf(R"DOC( +It operates ``%s`` on Tensor ``X`` and ``Y`` . + +.. math:: + %s + +.. note:: + ``paddle.%s`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. +)DOC", + comment.type, comment.equation, comment.type)); + } +}; + +template +class UnaryBitwiseOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + OpComment comment; + AddInput("X", string::Sprintf( + "Input Tensor of ``%s`` . 
It is " + "a N-D Tensor of bool, uint8, int8, int16, int32, int64.", + comment.type)); + AddOutput("Out", + string::Sprintf("Result of ``%s`` . It is a N-D Tensor with " + "the same data type of input Tensor.", + comment.type)); + AddComment(string::Sprintf(R"DOC( +It operates ``%s`` on Tensor ``X`` . + +.. math:: + %s + +)DOC", + comment.type, comment.equation)); + } +}; + +class BitwiseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // BitwiseOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } +}; + +template +class UnaryBitwiseOp : public BitwiseOp { + public: + using BitwiseOp::BitwiseOp; + + protected: + void InferShape(framework::InferShapeContext *context) const override { + OpComment comment; + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +template +class BinaryBitwiseOp : public BitwiseOp { + public: + using BitwiseOp::BitwiseOp; + + protected: + void InferShape(framework::InferShapeContext *context) const override { + OpComment comment; + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", comment.type); + OP_INOUT_CHECK(context->HasInput("Y"), "Input", "Y", comment.type); + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); + if (dim_x == dim_y) { + context->SetOutputDim("Out", dim_x); + } else { + int max_dim = std::max(dim_x.size(), dim_y.size()); + int axis = std::abs(dim_x.size() - dim_y.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(dim_x, dim_y, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), + max_dim, axis); + context->SetOutputDim("Out", framework::make_ddim(out_dims_array)); + } + context->ShareLoD("X", "Out"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = ::paddle::operators; + +#define REGISTER_BINARY_BITWISE_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ops::BinaryBitwiseOp<_##op_type##Comment>, \ + ops::BinaryBitwiseOpProtoMaker<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker, \ + ::paddle::framework::EmptyGradOpMaker); + +#define REGISTER_UNARY_BITWISE_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ops::UnaryBitwiseOp<_##op_type##Comment>, \ + ops::UnaryBitwiseOpProtoMaker<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker, \ + ::paddle::framework::EmptyGradOpMaker); + +REGISTER_BINARY_BITWISE_OP(bitwise_and, "Out = X \\& Y"); +REGISTER_BINARY_BITWISE_OP(bitwise_or, "Out = X | Y"); +REGISTER_BINARY_BITWISE_OP(bitwise_xor, "Out = X ^\\wedge Y"); +REGISTER_UNARY_BITWISE_OP(bitwise_not, "Out = \\sim X"); + +REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CPU, ops::BitwiseAndFunctor); 
+REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CPU, ops::BitwiseOrFunctor); +REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CPU, ops::BitwiseXorFunctor); +REGISTER_UNARY_BITWISE_KERNEL(bitwise_not, CPU, ops::BitwiseNotFunctor); diff --git a/paddle/fluid/operators/controlflow/bitwise_op.cu b/paddle/fluid/operators/controlflow/bitwise_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..b549f7e33005e33a2f73e0617beb2a8b12dd1245 --- /dev/null +++ b/paddle/fluid/operators/controlflow/bitwise_op.cu @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/controlflow/bitwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" + +namespace paddle { +namespace operators { + +#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ + template \ + struct Bitwise##func##CUDAFunctor { \ + using ELEM_TYPE = T; \ + HOSTDEVICE T operator()(const T* args) const { \ + return args[0] expr args[1]; \ + } \ + }; \ + \ + template <> \ + struct Bitwise##func##CUDAFunctor { \ + using ELEM_TYPE = bool; \ + HOSTDEVICE bool operator()(const bool* args) const { \ + return args[0] bool_expr args[1]; \ + } \ + }; + +BITWISE_BINARY_FUNCTOR(And, &, &&) +BITWISE_BINARY_FUNCTOR(Or, |, ||) +BITWISE_BINARY_FUNCTOR(Xor, ^, !=) +#undef BITWISE_BINARY_FUNCTOR + +template +struct BitwiseNotCUDAFunctor { + using ELEM_TYPE = T; + HOSTDEVICE T operator()(const T* args) const { return ~args[0]; } +}; + +template <> +struct BitwiseNotCUDAFunctor { + using ELEM_TYPE = bool; + HOSTDEVICE bool operator()(const bool* args) const { return !args[0]; } +}; + +template +class BinaryBitwiseOpKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEM_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + auto functor = Functor(); + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + + if (ins.size() == 1) { + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, functor); + } else { + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, functor); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = ::paddle::operators; +namespace plat = ::paddle::platform; + +REGISTER_BINARY_BITWISE_KERNEL(bitwise_and, CUDA, ops::BitwiseAndCUDAFunctor); +REGISTER_BINARY_BITWISE_KERNEL(bitwise_or, CUDA, ops::BitwiseOrCUDAFunctor); +REGISTER_BINARY_BITWISE_KERNEL(bitwise_xor, CUDA, ops::BitwiseXorCUDAFunctor); +REGISTER_BINARY_BITWISE_KERNEL(bitwise_not, CUDA, ops::BitwiseNotCUDAFunctor); diff --git a/paddle/fluid/operators/controlflow/bitwise_op.h b/paddle/fluid/operators/controlflow/bitwise_op.h new file mode 100644 index 0000000000000000000000000000000000000000..92abe4cd3b1c3630ed9c2652f2ff8a49f033f13b --- /dev/null +++ b/paddle/fluid/operators/controlflow/bitwise_op.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +#define BITWISE_BINARY_FUNCTOR(func, expr, bool_expr) \ + template \ + struct Bitwise##func##Functor { \ + using ELEM_TYPE = T; \ + HOSTDEVICE T operator()(const T& a, const T& b) const { return a expr b; } \ + }; \ + \ + template <> \ + struct Bitwise##func##Functor { \ + using ELEM_TYPE = bool; \ + HOSTDEVICE bool operator()(const bool& a, const bool& b) const { \ + return a bool_expr b; \ + } \ + }; + +BITWISE_BINARY_FUNCTOR(And, &, &&) +BITWISE_BINARY_FUNCTOR(Or, |, ||) +BITWISE_BINARY_FUNCTOR(Xor, ^, !=) +#undef BITWISE_BINARY_FUNCTOR + +template +struct BitwiseNotFunctor { + using ELEM_TYPE = T; + HOSTDEVICE T operator()(const T& a) const { return ~a; } +}; + +template <> +struct BitwiseNotFunctor { + using ELEM_TYPE = bool; + HOSTDEVICE bool operator()(const bool& a) const { return !a; } +}; + +template +class BinaryBitwiseOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + auto func = Functor(); + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* out = context.Output("Out"); + ElementwiseComputeEx(context, x, y, -1, func, + out); + } +}; + +template +class UnaryBitwiseOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + auto func = Functor(); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), out->mutable_data(context.GetPlace()), + func); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = ::paddle::operators; +namespace plat = ::paddle::platform; + +#define REGISTER_BINARY_BITWISE_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, \ + ops::BinaryBitwiseOpKernel>, \ + ops::BinaryBitwiseOpKernel>, \ + ops::BinaryBitwiseOpKernel>, \ + ops::BinaryBitwiseOpKernel>, \ + ops::BinaryBitwiseOpKernel>, \ + ops::BinaryBitwiseOpKernel>); + +#define REGISTER_UNARY_BITWISE_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, \ + ops::UnaryBitwiseOpKernel>, \ + ops::UnaryBitwiseOpKernel>, \ + ops::UnaryBitwiseOpKernel>, \ + ops::UnaryBitwiseOpKernel>, \ + ops::UnaryBitwiseOpKernel>, \ + ops::UnaryBitwiseOpKernel>); diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cc b/paddle/fluid/operators/controlflow/compare_all_op.cc index adacf70f5e14548806de80e629a15f915705d749..ede349f737d899e5f04cb5e35d1dbc0c0abc2403 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cc +++ b/paddle/fluid/operators/controlflow/compare_all_op.cc @@ -30,29 
+30,13 @@ class CompareReduceOpKernel auto* x = context.Input("X"); auto* y = context.Input("Y"); auto* z = context.Output("Out"); - bool shape_same = true; - Tensor tmp; - framework::DDim x_dims = x->dims(); - framework::DDim y_dims = y->dims(); - - // judge the two inputs shape is same, if not same, just return false - if (x_dims.size() != y_dims.size()) { - shape_same = false; - } else { - for (auto i = 0; i < x_dims.size(); i++) { - if (x_dims[i] != y_dims[i]) { - shape_same = false; - break; - } - } - } - bool* z_data = z->mutable_data(context.GetPlace()); - if (!shape_same) { + + if (x->dims() != y->dims()) { z_data[0] = false; } else { - tmp.mutable_data(x_dims, context.GetPlace()); + tmp.mutable_data(x->dims(), context.GetPlace()); if (x->numel() == 1 && y->numel() == 1) { bool* z_data = tmp.mutable_data(context.GetPlace()); z_data[0] = Functor()(x->data()[0], y->data()[0]); @@ -135,15 +119,17 @@ class CompareReduceOp : public framework::OperatorWithKernel { ::paddle::framework::EmptyGradOpMaker, \ ::paddle::framework::EmptyGradOpMaker); -#define REGISTER_COMPARE_REDUCE_CPU_KERNEL(op_type, functor) \ - REGISTER_OP_CPU_KERNEL( \ - op_type, ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ - ::paddle::platform::CPUDeviceContext, functor>, \ - ::paddle::operators::CompareReduceOpKernel< \ +#define REGISTER_COMPARE_REDUCE_CPU_KERNEL(op_type, functor) \ + REGISTER_OP_CPU_KERNEL( \ + op_type, ::paddle::operators::CompareReduceOpKernel< \ + ::paddle::platform::CPUDeviceContext, functor>, \ + ::paddle::operators::CompareReduceOpKernel< \ + ::paddle::platform::CPUDeviceContext, functor>, \ + ::paddle::operators::CompareReduceOpKernel< \ + ::paddle::platform::CPUDeviceContext, functor>, \ + ::paddle::operators::CompareReduceOpKernel< \ + ::paddle::platform::CPUDeviceContext, functor>, \ + ::paddle::operators::CompareReduceOpKernel< \ ::paddle::platform::CPUDeviceContext, functor>); REGISTER_COMPARE_REDUCE_OP(equal_all, "X == Y"); diff --git a/paddle/fluid/operators/controlflow/compare_all_op.cu b/paddle/fluid/operators/controlflow/compare_all_op.cu index e3c920f78c45b4c96115b8b650f2a08f544bc788..9e22d74d6e2aac97ad23f99ad9d5b6a7f9924bbe 100644 --- a/paddle/fluid/operators/controlflow/compare_all_op.cu +++ b/paddle/fluid/operators/controlflow/compare_all_op.cu @@ -14,14 +14,18 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/controlflow/compare_all_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/reduce_ops/cub_reduce.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + namespace paddle { namespace operators { template struct IdentityFunctor { HOSTDEVICE explicit inline IdentityFunctor() {} - HOSTDEVICE inline T operator()(const T& x) const { return x; } }; @@ -33,6 +37,24 @@ struct BitwiseAdd { return a & b; } }; + +template +struct CudaEqualReduceFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T args[]) const { + return (args[0] == args[1]); + } +}; + +template +struct CudaEqualReduceFunctor< + T, typename std::enable_if::value>::type> { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T args[]) const { + return fabs(static_cast(args[0] - args[1])) < 1e-8; + } +}; + template class CompareReduceOpKernel : public framework::OpKernel { @@ -44,32 +66,22 @@ class CompareReduceOpKernel auto* x = context.Input("X"); auto* y = context.Input("Y"); auto* z = context.Output("Out"); - bool shape_same = true; - + bool* z_data = z->mutable_data(context.GetPlace()); Tensor tmp; - framework::DDim x_dims = x->dims(); - framework::DDim y_dims = y->dims(); - if (x_dims.size() != y_dims.size()) { - shape_same = false; - } else { - for (auto i = 0; i < x_dims.size(); i++) { - if (x_dims[i] != y_dims[i]) { - shape_same = false; - break; - } - } - } - - bool* z_data = z->mutable_data(context.GetPlace()); - if (!shape_same) { + if (x->dims() != y->dims()) { thrust::device_ptr z_dev_ptr(z_data); thrust::fill(z_dev_ptr, z_dev_ptr + 1, false); return; } else { - tmp.mutable_data(x_dims, context.GetPlace()); - ElementwiseComputeEx(context, x, y, 0, - Functor(), &tmp); + tmp.mutable_data(x->dims(), context.GetPlace()); + const auto& cuda_ctx = + context.template device_context(); + std::vector ins = {x, y}; + std::vector outs = {&tmp}; + LaunchSameDimsElementwiseCudaKernel( + cuda_ctx, ins, &outs, Functor()); + // Reduce by 'bitwise and' operator std::vector reduce_dims; reduce_dims.resize(tmp.dims().size()); @@ -85,15 +97,17 @@ class CompareReduceOpKernel } // namespace operators } // namespace paddle -#define REGISTER_COMPARE_REDUCE_CUDA_KERNEL(op_type, functor) \ - REGISTER_OP_CUDA_KERNEL( \ - op_type, paddle::operators::CompareReduceOpKernel< \ - paddle::platform::CUDADeviceContext, functor>, \ - paddle::operators::CompareReduceOpKernel< \ - paddle::platform::CUDADeviceContext, functor>, \ - paddle::operators::CompareReduceOpKernel< \ - paddle::platform::CUDADeviceContext, functor>, \ - paddle::operators::CompareReduceOpKernel< \ - paddle::platform::CUDADeviceContext, functor>); -REGISTER_COMPARE_REDUCE_CUDA_KERNEL(equal_all, - paddle::operators::EqualReduceFunctor); +#define REGISTER_COMPARE_REDUCE_CUDA_KERNEL(op_type, functor) \ + REGISTER_OP_CUDA_KERNEL( \ + op_type, \ + ops::CompareReduceOpKernel>, \ + ops::CompareReduceOpKernel>, \ + ops::CompareReduceOpKernel>, \ + ops::CompareReduceOpKernel>, \ + ops::CompareReduceOpKernel>); + +REGISTER_COMPARE_REDUCE_CUDA_KERNEL(equal_all, CudaEqualReduceFunctor) +#undef REGISTER_COMPARE_REDUCE_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index bf047de86fc21a4d5d9e9ff8f20c9a1982eb25af..a03e4165755dde3211425b028b474896249237f7 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -131,18 
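For equal_all, the rewritten kernels above keep the same contract: mismatched shapes short-circuit to false, floating-point elements compare with an absolute tolerance of 1e-8 (CudaEqualReduceFunctor), and the elementwise results are reduced with BitwiseAdd, i.e. a logical AND. A host-side sketch of that contract, not the CUDA path itself:

```cpp
#include <cmath>
#include <cstddef>
#include <iostream>
#include <type_traits>
#include <vector>

// equal_all: shapes must match exactly (no broadcasting), floats compare
// with an absolute tolerance of 1e-8, and the per-element results are
// AND-reduced to a single bool.
template <typename T>
bool EqualAll(const std::vector<T>& x, const std::vector<size_t>& x_shape,
              const std::vector<T>& y, const std::vector<size_t>& y_shape) {
  if (x_shape != y_shape) return false;
  bool all_equal = true;
  for (size_t i = 0; i < x.size(); ++i) {
    const bool eq = std::is_floating_point<T>::value
                        ? std::fabs(static_cast<double>(x[i] - y[i])) < 1e-8
                        : x[i] == y[i];
    all_equal = all_equal && eq;  // reduction by logical AND
  }
  return all_equal;
}

int main() {
  std::vector<float> a{1.0f, 2.0f}, b{1.0f, 2.0f};
  std::cout << EqualAll(a, {2}, b, {2}) << "\n";     // 1
  std::cout << EqualAll(a, {2}, b, {1, 2}) << "\n";  // 0: shapes differ
}
```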
+131,18 @@ class CompareOp : public framework::OperatorWithKernel { REGISTER_COMPARE_OP(less_than, "Out = X < Y"); REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, - paddle::operators::GreaterEqualFunctor); + paddle::operators::GreaterThanFunctor); REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, - paddle::operators::GreaterThanFunctor); + paddle::operators::GreaterEqualFunctor); REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); REGISTER_COMPARE_KERNEL(greater_than, CPU, paddle::operators::GreaterThanFunctor, - paddle::operators::LessEqualFunctor); + paddle::operators::LessThanFunctor); REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); REGISTER_COMPARE_KERNEL(greater_equal, CPU, paddle::operators::GreaterEqualFunctor, - paddle::operators::LessThanFunctor); + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_OP(equal, "Out = X == Y"); REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, paddle::operators::EqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index 3ca700e16e6e7bcf4136ca68dd895593a63824ec..bf7861a03d8d4da4ff1ae65ff62c761ffab914bd 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -13,18 +13,84 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor, - paddle::operators::GreaterEqualFunctor); -REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, - paddle::operators::GreaterThanFunctor); -REGISTER_COMPARE_KERNEL(greater_than, CUDA, - paddle::operators::GreaterThanFunctor, - paddle::operators::LessEqualFunctor); -REGISTER_COMPARE_KERNEL(greater_equal, CUDA, - paddle::operators::GreaterEqualFunctor, - paddle::operators::LessThanFunctor); -REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor, - paddle::operators::EqualFunctor); -REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor, - paddle::operators::NotEqualFunctor); +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +namespace paddle { +namespace operators { + +#define DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(func, op) \ + template \ + struct func { \ + using ELEMENT_TYPE = T; \ + inline HOSTDEVICE bool operator()(const T* args) const { \ + return args[0] op args[1]; \ + } \ + }; + +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaLessThanFunctor, <) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaLessEqualFunctor, <=) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaGreaterThanFunctor, >) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaGreaterEqualFunctor, >=) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaEqualFunctor, ==) +DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT(CudaNotEqualFunctor, !=) +#undef DEFINE_CMP_BINARY_FUNCTOR_WITH_PONTER_INPUT + +template +struct CudaEqualFunctor< + T, typename std::enable_if::value>::type> { + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T* args) const { + return fabs(static_cast(args[0] - args[1])) < 1e-8; + } +}; + +template +struct CudaNotEqualFunctor< + T, typename std::enable_if::value>::type> { + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const 
T* args) const { + return fabs(static_cast(args[0] - args[1])) > 1e-8; + } +}; + +template +class CompareOpKernel + : public framework::OpKernel { + public: + using InT = typename Functor::ELEMENT_TYPE; + using OutT = bool; + void Compute(const framework::ExecutionContext& ctx) const override { + auto functor = Functor(); + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, functor); + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_CUDA_COMPARE_KERNEL(op_type, func) \ + REGISTER_OP_CUDA_KERNEL( \ + op_type, \ + ops::CompareOpKernel, void>, \ + ops::CompareOpKernel, void>, \ + ops::CompareOpKernel, void>, \ + ops::CompareOpKernel, void>, \ + ops::CompareOpKernel, void>); + +REGISTER_CUDA_COMPARE_KERNEL(equal, CudaEqualFunctor) +REGISTER_CUDA_COMPARE_KERNEL(not_equal, CudaNotEqualFunctor) +REGISTER_CUDA_COMPARE_KERNEL(less_than, CudaLessThanFunctor) +REGISTER_CUDA_COMPARE_KERNEL(less_equal, CudaLessEqualFunctor) +REGISTER_CUDA_COMPARE_KERNEL(greater_than, CudaGreaterThanFunctor) +REGISTER_CUDA_COMPARE_KERNEL(greater_equal, CudaGreaterEqualFunctor) +#undef REGISTER_CUDA_COMPARE_KERNEL diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h index ff929ee7dfce79536a9ce7c8ae6878fb7e3871e9..36185322a96b8909c49e1a3c5a55afa47d4952bc 100644 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ b/paddle/fluid/operators/controlflow/compare_op.h @@ -98,6 +98,9 @@ class CompareOpKernel #define REGISTER_COMPARE_KERNEL(op_type, dev, functor, inverse_functor) \ REGISTER_OP_##dev##_KERNEL(op_type, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ ::paddle::operators::CompareOpKernel< \ ::paddle::platform::dev##DeviceContext, \ functor, inverse_functor>, \ diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc index 591fb55936734ffc675dad5c6912e7cbf4e80471..d1656fd079cd76446d12e553a1ff37af5bfeeeaa 100644 --- a/paddle/fluid/operators/controlflow/compare_op_npu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -34,7 +34,7 @@ class EqualNPUKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Equal", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -51,7 +51,7 @@ class LessThanNPUKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); // int axis = context.Attr("axis"); z->mutable_data(ctx.GetPlace()); // allocate - auto runner = NpuOpRunner("Less", {*x, *y}, {*z}); + const auto& runner = NpuOpRunner("Less", {*x, *y}, {*z}); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc index 62019be26cdef8214fe0e7c3e063c9387a30c91a..6705d42bcd74086e327d54fa44b9daf03efcba40 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc @@ -73,6 +73,8 @@ class ConditionalBlockInferOp : public ConditionalOp { framework::Executor exec(dev_place); auto *block = Attr("sub_block"); + VLOG(3) << "Conditional block.idx = " 
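The functor swap in compare_op.cc above matters because the second template argument is used when the elementwise kernel exchanges its operands for broadcasting, so it must be the same comparison with swapped arguments: x < y exactly when y > x, which GreaterEqual gets wrong on ties. A small check of that pairing, using plain functors and assuming the swapped-operand usage just described:

```cpp
#include <cassert>

template <typename T>
struct LessThan { bool operator()(T a, T b) const { return a < b; } };
template <typename T>
struct GreaterThan { bool operator()(T a, T b) const { return a > b; } };
template <typename T>
struct GreaterEqual { bool operator()(T a, T b) const { return a >= b; } };

int main() {
  LessThan<int> lt;
  GreaterThan<int> gt;
  GreaterEqual<int> ge;
  for (int a = -2; a <= 2; ++a) {
    for (int b = -2; b <= 2; ++b) {
      // Correct pairing: the inverse functor with swapped operands
      // reproduces the original comparison.
      assert(lt(a, b) == gt(b, a));
      // The old pairing (GreaterEqual) disagrees whenever a == b.
      if (a == b) assert(lt(a, b) != ge(b, a));
    }
  }
  return 0;
}
```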
<< block->ID() + << ", scope = " << &cur_scope; exec.Run(*block->Program(), &cur_scope, block->ID(), false); scope.DeleteScope(scopes->front()); } diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h index 22eb2ece4b05b8ad7fad3acdc545e3c98d211f31..7ce63aa9cbbfaaa4adb7834dd33e24cb6491a7a9 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/controlflow/conditional_block_op.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index fdd1b776bd8fa3f24fb596af29512f1f781dce4c..d86b6b48422d94604724303de72f401bfba2e23e 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -44,11 +44,6 @@ static void DataCopy(const framework::LoDTensor &src_item, TensorCopySync(src_item, platform::CPUPlace(), dst_item); } #else -#ifdef PADDLE_WITH_ASCEND_CL - if (platform::is_npu_place(src_item.place())) { - platform::DeviceContextPool::Instance().Get(src_item.place())->Wait(); - } -#endif TensorCopySync(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu index 7ca54b488bfbb260c422941b82145f092a150be7..6cbcd516e08264499afdea00d081ae93eb8b319b 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ b/paddle/fluid/operators/controlflow/logical_op.cu @@ -13,12 +13,68 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA, - paddle::operators::LogicalAndFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA, - paddle::operators::LogicalOrFunctor); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA, - paddle::operators::LogicalNotFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA, - paddle::operators::LogicalXorFunctor); +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +namespace paddle { +namespace operators { + +#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ + template \ + struct func_name { \ + using ELEMENT_TYPE = T; \ + HOSTDEVICE bool operator()(const T* args) const { \ + return args[0] op args[1]; \ + } \ + }; + +LOGICAL_BINARY_FUNCTOR(CudaOrFunctor, ||) +LOGICAL_BINARY_FUNCTOR(CudaAndFunctor, &&) +LOGICAL_BINARY_FUNCTOR(CudaXorFunctor, ^) +#undef LOGICAL_BINARY_FUNCTOR + +template +struct CudaNotFunctor { + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T* args) const { return !args[0]; } +}; + +template +class BinaryLogicalOpKernel + : public framework::OpKernel { + public: + using InT = typename Functor::ELEMENT_TYPE; + using OutT = bool; + void Compute(const framework::ExecutionContext& ctx) const override { + auto functor = Functor(); + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + + if (ins.size() == 1) { + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, functor); + } else { + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, functor); + } + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \ + REGISTER_OP_CUDA_KERNEL( \ + op_name, \ + ops::BinaryLogicalOpKernel>); + +REGISTER_LOGICAL_CUDA_KERNEL(logical_or, CudaOrFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_and, CudaAndFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, CudaXorFunctor) +REGISTER_LOGICAL_CUDA_KERNEL(logical_not, CudaNotFunctor) +#undef REGISTER_LOGICAL_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc index 1b0c0e444347af0a90f8244590b84199dc97f931..b9807bfa53e1e116089f5a593d69f5110b0b8f10 100644 --- a/paddle/fluid/operators/controlflow/logical_op_npu.cc +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -40,7 +40,7 @@ class LogicalNotNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); + const auto& runner = NpuOpRunner("LogicalNot", {*x}, {*out}, {}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/controlflow/unity_build_rule.cmake b/paddle/fluid/operators/controlflow/unity_build_rule.cmake index 6ed8f8a75374eaba122e7a3b3d935079a81756ee..f75785bd961c2543a20877d6b68d84471df96f41 100644 --- a/paddle/fluid/operators/controlflow/unity_build_rule.cmake +++ b/paddle/fluid/operators/controlflow/unity_build_rule.cmake @@ -12,9 +12,11 @@ register_unity_group(cc fetch_op.cc get_places_op.cc logical_op.cc + bitwise_op.cc tensor_array_read_write_op.cc while_op.cc) register_unity_group(cu logical_op.cu + bitwise_op.cu compare_op.cu compare_all_op.cu) diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 
9825fcd8a6a67b9fd21e70e0870cc904ca9a9dbf..c6cd45dc18ba323407e3b3a0d5729c3b19a10c47 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/conv_search_cache.h" #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/cudnn_desc.h" namespace paddle { namespace operators { @@ -58,8 +59,8 @@ static void RemovePaddingSlice(const framework::ExecutionContext& context, *context.template device_context().eigen_device(); auto in_dims = input->dims(); auto new_out_dims = out->dims(); - auto offsets = Eigen::array(); - auto extents = Eigen::array(); + auto offsets = Eigen::DSizes(); + auto extents = Eigen::DSizes(); for (size_t i = 0; i < D; ++i) { offsets[i] = 0; extents[i] = new_out_dims[i]; @@ -81,7 +82,8 @@ static void RemovePaddingSlice(const framework::ExecutionContext& context, auto out_t = framework::EigenTensor::From( *out, new_out_dims); - out_t.device(place) = in_t.slice(offsets, extents); + EigenSlice, T, D>::Eval(place, out_t, in_t, + offsets, extents); } template @@ -209,20 +211,31 @@ struct SearchAlgorithm { #if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) auto& dev_ctx = ctx.template device_context(); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - args.cdesc.desc(), CUDNN_DEFAULT_MATH)); - VLOG(5) << "NOT use cudnn_tensor_op_math"; if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), CUDNN_TENSOR_OP_MATH)); VLOG(5) << "use cudnn_tensor_op_math"; - } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) { #if CUDA_VERSION >= 11000 +#if CUDNN_VERSION_MIN(8, 1, 0) + } else if (dev_ctx.GetComputeCapability() >= 80 && + dtype == CUDNN_DATA_BFLOAT16) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), + CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "use cudnn_tensor_op_math"; +#endif // CUDNN_VERSION >= 8100 + } else if (dtype == CUDNN_DATA_FLOAT && !args.cdesc.allow_tf32_) { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), CUDNN_FMA_MATH)); + VLOG(5) << "use cudnn_fma_math"; #endif // CUDA_VERSION >= 11000 + } else { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnSetConvolutionMathType(args.cdesc.desc(), + CUDNN_DEFAULT_MATH)); + VLOG(5) << "use cudnn_default_math"; } #endif diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index ab535e341f7575d4eef06af555b0aff4fa151f83..c49a3ee1c20ed32bd8d0504a28e4d7bb5f9917e3 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -699,24 +699,51 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f; +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + ScalingParamType beta = 0.0f; +#else ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; +#endif VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr("use_addto"); if (input_grad) { // When beta is 0, it is unnecessary to reset input_grad. // When beta is 1, the output cannot be reset since addt strategy used. 
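The reordered cudnnSetConvolutionMathType branch above now prefers tensor-op math for FP16 on compute capability 70+ and for BF16 on 80+ with cuDNN 8.1+, falls back to FMA math for FP32 when TF32 is disallowed, and otherwise uses default math. The same decision written as a pure function; MathType stands in for cudnnMathType_t, no cuDNN call is made, and the surrounding CUDA-version guards are omitted:

```cpp
#include <iostream>

enum class MathType { kDefault, kTensorOp, kFMA };

// Distilled from the branch above; illustration only.
MathType ChooseConvMathType(int compute_capability, bool is_half,
                            bool is_bfloat16, bool is_float, bool allow_tf32,
                            int cudnn_version) {
  if (compute_capability >= 70 && is_half) return MathType::kTensorOp;
  if (compute_capability >= 80 && is_bfloat16 && cudnn_version >= 8100)
    return MathType::kTensorOp;
  if (is_float && !allow_tf32) return MathType::kFMA;
  return MathType::kDefault;
}

int main() {
  // FP32 with TF32 disallowed: FMA math (prints 2).
  std::cout << static_cast<int>(
                   ChooseConvMathType(80, false, false, true, false, 8200))
            << "\n";
}
```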
#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), - transformed_input_grad_data, cudnn_workspace_ptr, - workspace_size)); - }, - workspace_size); + if (ctx.Attr("use_addto")) { + Tensor temp_tensor(transformed_input_grad.type()); + temp_tensor.Resize(transformed_input_grad.dims()); + T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, args1.odesc.desc(), output_grad_data, + args1.wdesc.desc(), filter_data, args1.cdesc.desc(), + data_algo, &beta, args1.idesc.desc(), temp_tensor_data, + cudnn_workspace_ptr, workspace_size)); + }, + workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), + transformed_input_grad_data, &alpha, args1.idesc.desc(), + temp_tensor_data, &beta, args1.idesc.desc(), + transformed_input_grad_data)); + } else { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, args1.odesc.desc(), output_grad_data, + args1.wdesc.desc(), filter_data, args1.cdesc.desc(), + data_algo, &beta, args1.idesc.desc(), + transformed_input_grad_data, cudnn_workspace_ptr, + workspace_size)); + }, + workspace_size); + } + #else for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( @@ -1386,6 +1413,31 @@ REGISTER_OP_KERNEL( paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); #else +#if CUDNN_VERSION_MIN(8, 1, 0) +REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); +REGISTER_OP_KERNEL( + conv2d_grad_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel); + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d_grad_grad, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel); +#else REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel, paddle::operators::CUDNNConvOpKernel, @@ -1405,6 +1457,7 @@ REGISTER_OP_CUDA_KERNEL( paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); +#endif REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel, diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 3ab27e1ec4f4fc68498270e7656d9dfb60bd9a92..befe09c8e6beb3d911521e4ff78f3427a3b0dd78 100644 --- 
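The HIP branch above works around MIOpen's beta == 0 restriction: when use_addto is set, the data gradient is computed into a scratch tensor and then added back with miopenOpTensor instead of accumulating in place. A generic sketch of that pattern; BackwardData here is a placeholder standing in for the MIOpen backward-data call, not a real API:

```cpp
#include <cstddef>
#include <vector>

using Buffer = std::vector<float>;

// Placeholder for the backward-data call: pretend it fills *dst with the
// data gradient, always overwriting (beta == 0 semantics).
void BackwardData(const Buffer& dout, Buffer* dst) {
  for (size_t i = 0; i < dst->size(); ++i) (*dst)[i] = dout[i % dout.size()];
}

// use_addto == false: overwrite dx directly.
// use_addto == true : compute into a scratch buffer, then dx += scratch.
void BackwardDataAddto(const Buffer& dout, Buffer* dx, bool use_addto) {
  if (!use_addto) {
    BackwardData(dout, dx);
    return;
  }
  Buffer tmp(dx->size());
  BackwardData(dout, &tmp);
  for (size_t i = 0; i < dx->size(); ++i) (*dx)[i] += tmp[i];
}

int main() {
  Buffer dout(4, 1.0f), dx(4, 2.0f);
  BackwardDataAddto(dout, &dx, /*use_addto=*/true);
  return dx[0] == 3.0f ? 0 : 1;  // 2 (existing value) + 1 (gradient)
}
```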
a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -146,28 +146,8 @@ struct SearchAlgorithm { cudnn_workspace_ptr, workspace_size, false)); }; - if (!exhaustive_search && !deterministic) { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - algo = find_result.fwd_algo; - } else { - auto& temp = ctx.cuda_device_context(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetForward()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - return find_result.fwd_algo; - }); - } + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.fwd_algo; VLOG(3) << "choose algo " << algo; return algo; } @@ -208,27 +188,8 @@ struct SearchAlgorithm { cudnn_workspace_ptr, workspace_size, false)); }; - if (!exhaustive_search && !deterministic) { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - algo = find_result.bwd_data_algo; - } else { - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardData()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - return find_result.bwd_data_algo; - }); - } + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.bwd_data_algo; VLOG(3) << "choose algo " << algo; return algo; } @@ -269,27 +230,8 @@ struct SearchAlgorithm { cudnn_workspace_ptr, workspace_size, false)); }; - if (!exhaustive_search && !deterministic) { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - algo = find_result.bwd_weights_algo; - } else { - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardFilter()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - return find_result.bwd_weights_algo; - }); - } + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.bwd_weights_algo; VLOG(3) << "choose algo " << algo; return algo; } diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 85bb4e5baa058a4cc5e6e4b9e1aec9ac75b3c5ea..1266cfe6081acf46fe66212adda23a396601965f 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -73,7 +73,17 @@ std::vector ConvOp::ComputeOutputShape( "the filter's dimension is %d.", in_dims, 
in_dims.size(), filter_dims, filter_dims.size())); - int in_sub_stride_size = in_dims.size() - strides.size(); + int stride_size = strides.size(); + for (int i = 0; i < stride_size; ++i) { + PADDLE_ENFORCE_GT( + strides[i], 0, + platform::errors::InvalidArgument( + "The stride of Op(Conv) should be larget than 0, but received " + "stride is %d.", + strides[i])); + } + + int in_sub_stride_size = in_dims.size() - stride_size; PADDLE_ENFORCE_EQ( in_dims.size(), strides.size() + 2U, platform::errors::InvalidArgument( @@ -189,6 +199,15 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( platform::errors::InvalidArgument( "float16 can only be used when CUDNN is used")); } +#if PADDLE_WITH_CUDA + if (input_data_type == framework::proto::VarType::BF16 && + library == framework::LibraryType::kCUDNN) { + PADDLE_ENFORCE_GE( + platform::CudnnVersion(), 8100, + platform::errors::InvalidArgument( + "bfloat16 can only be used when CUDNN_VERSION >= 8100")); + } +#endif // PADDLE_WITH_CUDA auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, library, customized_type_value); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 4ea936d5104b83ce30e43fe214e7f1e0936325ee..f004ea1c69e0c5ba69f26a1e3141e6e407fad4be 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -66,7 +66,19 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { "input is [%s], the dimension size of input is [%d], the shape " "of filter is [%s], the dimension size of filter is [%d]. ", in_dims, in_dims.size(), filter_dims, filter_dims.size())); - int in_sub_stride_size = in_dims.size() - strides.size(); + + int stride_size = strides.size(); + for (int i = 0; i < stride_size; ++i) { + PADDLE_ENFORCE_GT( + strides[i], 0, + platform::errors::InvalidArgument( + "The stride of Op(Conv) should be larget than 0, but received " + "stride is %d.", + strides[i])); + } + + int in_sub_stride_size = in_dims.size() - stride_size; + PADDLE_ENFORCE_EQ( in_dims.size() - strides.size(), 2U, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index ecf5b6d774a2605c06bbeb2514c981b46e7f6a0d..b8335c75064286625997d2874fb076721afdde85 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -19,6 +19,7 @@ limitations under the License. 
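Both conv_op.cc and conv_transpose_op.cc above now reject non-positive strides before doing any shape arithmetic, since a zero stride would divide by zero in the usual output-size formula and a negative one would produce nonsense extents. A standalone sketch of the check alongside the conventional conv output-size computation it protects:

```cpp
#include <stdexcept>
#include <vector>

// Conventional conv output-size computation; stride must be positive.
int ConvOutputSize(int in, int filter, int dilation, int pad, int stride) {
  if (stride <= 0) {
    throw std::invalid_argument("conv stride must be greater than 0");
  }
  const int dkernel = dilation * (filter - 1) + 1;
  return (in + 2 * pad - dkernel) / stride + 1;
}

// Mirrors the new per-dimension check added to both operators.
void CheckStrides(const std::vector<int>& strides) {
  for (int s : strides) {
    if (s <= 0) throw std::invalid_argument("conv stride must be > 0");
  }
}

int main() {
  CheckStrides({2, 2});
  // 32x32 input, 3x3 filter, pad 1, stride 2 -> 16 per spatial dim.
  return ConvOutputSize(32, 3, 1, 1, 2) == 16 ? 0 : 1;
}
```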
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/conv_op.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/math/depthwise_conv.h" @@ -40,8 +41,8 @@ static void Slice(const framework::ExecutionContext& context, auto& place = *context.template device_context().eigen_device(); auto in_dims = input->dims(); - auto offsets = Eigen::array(); - auto extents = Eigen::array(); + auto offsets = Eigen::DSizes(); + auto extents = Eigen::DSizes(); for (size_t i = 0; i < D; ++i) { offsets[i] = 0; extents[i] = in_dims[i]; @@ -64,7 +65,8 @@ static void Slice(const framework::ExecutionContext& context, framework::EigenTensor::From( *out, out_dims); - out_t.device(place) = in_t.slice(offsets, extents); + EigenSlice, T, D>::Eval(place, out_t, in_t, + offsets, extents); out->Resize(out_dims); } diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu index a51fce8132418b09c8f2db397fc83c8c69a8a429..f488cc12e642b885f66d9b099ff211c9d419cbc6 100644 --- a/paddle/fluid/operators/correlation_op.cu +++ b/paddle/fluid/operators/correlation_op.cu @@ -12,17 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef PADDLE_WITH_HIP -// HIP not supported yet - #include #include #include "paddle/fluid/framework/op_registry.h" +#ifdef __HIPCC__ +#define __syncwarp() __all(1) +#endif + namespace paddle { namespace operators { +#ifdef __HIPCC__ +#define THREADS_PER_BLOCK 64 +#else #define THREADS_PER_BLOCK 32 +#endif #define FULL_MASK 0xffffffff using framework::Tensor; @@ -30,18 +35,27 @@ using framework::Tensor; template __forceinline__ __device__ T warpReduceSum(T val) { for (int offset = 16; offset > 0; offset /= 2) { +#ifdef __HIPCC__ + val += __shfl_down(val, offset); +#else val += __shfl_down_sync(FULL_MASK, val, offset); +#endif } return val; } template __forceinline__ __device__ T blockReduceSum(T val) { +#ifdef __HIPCC__ + static __shared__ T shared[64]; +#else static __shared__ T shared[32]; +#endif int lane = threadIdx.x % warpSize; int wid = threadIdx.x / warpSize; val = warpReduceSum(val); + __syncthreads(); if (lane == 0) shared[wid] = val; __syncthreads(); @@ -483,5 +497,3 @@ REGISTER_OP_CUDA_KERNEL(correlation, ops::CorrelationCUDAKernel, ops::CorrelationCUDAKernel); REGISTER_OP_CUDA_KERNEL(correlation_grad, ops::CorrelationCUDAGradKernel, ops::CorrelationCUDAGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index 2031ed14242a1a2b4a441bf171bfeb31790506a3..193c0ca8dc0f4dbb6eff06f4899c53e7bf460cf7 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -220,3 +220,10 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( crop_grad, ops::CropGradKernel, ops::CropGradKernel); + +REGISTER_OP_CUDA_KERNEL( + crop, ops::CropKernel, + ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop_grad, ops::CropGradKernel, + ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu deleted file mode 100644 index 0a83e6aa57155b3bd85f8be02be9fa2f9cab39a8..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/crop_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 
2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/crop_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - crop, ops::CropKernel, - ops::CropKernel); -REGISTER_OP_CUDA_KERNEL( - crop_grad, ops::CropGradKernel, - ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h index 0338495096a7b1553152a80a68dc4e054859105c..f1fc216bd4feb470e0c811344428239c3ff9c9da 100644 --- a/paddle/fluid/operators/crop_op.h +++ b/paddle/fluid/operators/crop_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/strided_memcpy.h" namespace paddle { @@ -89,15 +90,16 @@ void CropFunction(const framework::ExecutionContext& context) { auto x_tensor = EigenTensor::From(*x); auto out_tensor = EigenTensor::From(*out); - Eigen::array e_offsets; - Eigen::array e_shape; + Eigen::DSizes e_offsets; + Eigen::DSizes e_shape; for (size_t i = 0; i < D; ++i) { e_offsets[i] = offsets[i]; e_shape[i] = out->dims()[i]; } auto& place = *context.template device_context().eigen_device(); - out_tensor.device(place) = x_tensor.slice(e_offsets, e_shape); + EigenSlice, T, D>::Eval( + place, out_tensor, x_tensor, e_offsets, e_shape); } template @@ -148,16 +150,17 @@ void CropGradFunction(const framework::ExecutionContext& context) { auto* d_out = context.Input(framework::GradVarName("Out")); d_x->mutable_data(x->dims(), context.GetPlace()); auto offsets = GetOffsets(context); - Eigen::array, D> paddings; + Eigen::array, D> paddings; for (size_t i = 0; i < D; ++i) { paddings[i].first = offsets[i]; paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i]; } auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - d_x_tensor.device( - *context.template device_context().eigen_device()) = - d_out_tensor.pad(paddings, 0); + auto& place = + *context.template device_context().eigen_device(); + EigenPad, T, D>::Eval( + place, d_x_tensor, d_out_tensor, paddings, static_cast(0)); } } diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc index 514333c57f57cf3efa7b40f07d1a7c024e1d1715..28238082b18bf1279cb1ef4649aa8fd465c50b6b 100644 --- a/paddle/fluid/operators/crop_tensor_op.cc +++ b/paddle/fluid/operators/crop_tensor_op.cc @@ -319,3 +319,16 @@ REGISTER_OP_CPU_KERNEL( ops::CropTensorGradKernel, ops::CropTensorGradKernel, ops::CropTensorGradKernel); + +REGISTER_OP_CUDA_KERNEL( + crop_tensor, + ops::CropTensorKernel, + ops::CropTensorKernel, + ops::CropTensorKernel, + ops::CropTensorKernel); +REGISTER_OP_CUDA_KERNEL( + crop_tensor_grad, + ops::CropTensorGradKernel, + ops::CropTensorGradKernel, + ops::CropTensorGradKernel, + ops::CropTensorGradKernel); diff --git a/paddle/fluid/operators/crop_tensor_op.cu b/paddle/fluid/operators/crop_tensor_op.cu deleted 
file mode 100644 index c3a144d1719d041dd56323850de04f6a1c71b29a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/crop_tensor_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/crop_tensor_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - crop_tensor, - ops::CropTensorKernel, - ops::CropTensorKernel, - ops::CropTensorKernel, - ops::CropTensorKernel); -REGISTER_OP_CUDA_KERNEL( - crop_tensor_grad, - ops::CropTensorGradKernel, - ops::CropTensorGradKernel, - ops::CropTensorGradKernel, - ops::CropTensorGradKernel); diff --git a/paddle/fluid/operators/crop_tensor_op.h b/paddle/fluid/operators/crop_tensor_op.h index 58960465b90bd0eb427f78b00dfe21a7b0e7abe8..54666c8482c021bee2b9cc2679ccf4a65daf4cd7 100644 --- a/paddle/fluid/operators/crop_tensor_op.h +++ b/paddle/fluid/operators/crop_tensor_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/strided_memcpy.h" namespace paddle { @@ -199,15 +200,16 @@ void CropTensorFunction(const framework::ExecutionContext& context) { auto x_tensor = EigenTensor::From(*x); auto out_tensor = EigenTensor::From(*out); - Eigen::array e_offsets; - Eigen::array e_shape; + Eigen::DSizes e_offsets; + Eigen::DSizes e_shape; for (size_t i = 0; i < D; ++i) { e_offsets[i] = offsets[i]; e_shape[i] = out->dims()[i]; } auto& place = *context.template device_context().eigen_device(); - out_tensor.device(place) = x_tensor.slice(e_offsets, e_shape); + EigenSlice, T, D>::Eval( + place, out_tensor, x_tensor, e_offsets, e_shape); } template @@ -259,16 +261,17 @@ void CropTensorGradFunction(const framework::ExecutionContext& context) { auto* d_out = context.Input(framework::GradVarName("Out")); d_x->mutable_data(x->dims(), context.GetPlace()); auto offsets = GetOffsets(context); - Eigen::array, D> paddings; + Eigen::array, D> paddings; for (size_t i = 0; i < D; ++i) { paddings[i].first = offsets[i]; paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i]; } auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - d_x_tensor.device( - *context.template device_context().eigen_device()) = - d_out_tensor.pad(paddings, 0); + auto& place = + *context.template device_context().eigen_device(); + EigenPad, T, D>::Eval( + place, d_x_tensor, d_out_tensor, paddings, static_cast(0)); } } diff --git a/paddle/fluid/operators/decode_jpeg_op.cc b/paddle/fluid/operators/decode_jpeg_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd82c74885b9496bf64729a74a6527e68c80faf6 --- /dev/null +++ b/paddle/fluid/operators/decode_jpeg_op.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
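The crop kernels above keep the same math but route it through the EigenSlice and EigenPad wrappers. What those calls evaluate, shown directly on an Eigen tensor; this assumes Eigen's unsupported Tensor module and does not go through Paddle's wrapper API:

```cpp
#include <iostream>
#include <utility>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> x(3, 4);
  x.setConstant(1.0f);

  // crop: take a 2x2 window starting at offset (1, 1).
  Eigen::DSizes<Eigen::Index, 2> offsets(1, 1);
  Eigen::DSizes<Eigen::Index, 2> extents(2, 2);
  Eigen::Tensor<float, 2> cropped = x.slice(offsets, extents);

  // crop_grad: pad the window back to the original extent with zeros.
  Eigen::array<std::pair<Eigen::Index, Eigen::Index>, 2> paddings;
  paddings[0] = {1, 0};  // one row before, none after
  paddings[1] = {1, 1};  // one column before, one after
  Eigen::Tensor<float, 2> restored = cropped.pad(paddings, 0.0f);

  std::cout << cropped.dimension(0) << "x" << cropped.dimension(1) << "\n";    // 2x2
  std::cout << restored.dimension(0) << "x" << restored.dimension(1) << "\n";  // 3x4
}
```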
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class CPUDecodeJpegKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // TODO(LieLinJiang): add cpu implement. + PADDLE_THROW(platform::errors::Unimplemented( + "DecodeJpeg op only supports GPU now.")); + } +}; + +class DecodeJpegOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DecodeJpeg"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DecodeJpeg"); + + auto mode = ctx->Attrs().Get("mode"); + std::vector out_dims; + + if (mode == "unchanged") { + out_dims = {-1, -1, -1}; + } else if (mode == "gray") { + out_dims = {1, -1, -1}; + } else if (mode == "rgb") { + out_dims = {3, -1, -1}; + } else { + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU: ", mode)); + } + + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const { + if (var_name == "X") { + return expected_kernel_type; + } + + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } +}; + +class DecodeJpegOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "A one dimensional uint8 tensor containing the raw bytes " + "of the JPEG image. It is a tensor with rank 1."); + AddOutput("Out", "The output tensor of DecodeJpeg op"); + AddComment(R"DOC( +This operator decodes a JPEG image into a 3 dimensional RGB Tensor +or 1 dimensional Gray Tensor. Optionally converts the image to the +desired format. The values of the output tensor are uint8 between 0 +and 255. 
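DecodeJpeg's InferShape above fixes only the channel count from `mode` and leaves height and width dynamic until the JPEG header is read at run time. The same mapping as a standalone function:

```cpp
#include <stdexcept>
#include <string>
#include <vector>

// Static output shape (CHW) derived from the decode mode; -1 marks a
// dimension that is only known once the JPEG header is parsed.
std::vector<int> DecodeJpegOutShape(const std::string& mode) {
  if (mode == "unchanged") return {-1, -1, -1};
  if (mode == "gray") return {1, -1, -1};
  if (mode == "rgb") return {3, -1, -1};
  throw std::invalid_argument("unsupported decode_jpeg mode: " + mode);
}

int main() { return DecodeJpegOutShape("rgb")[0] == 3 ? 0 : 1; }
```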
+)DOC"); + AddAttr( + "mode", + "(string, default \"unchanged\"), The read mode used " + "for optionally converting the image, can be \"unchanged\" " + ",\"gray\" , \"rgb\" .") + .SetDefault("unchanged"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + decode_jpeg, ops::DecodeJpegOp, ops::DecodeJpegOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(decode_jpeg, ops::CPUDecodeJpegKernel) diff --git a/paddle/fluid/operators/decode_jpeg_op.cu b/paddle/fluid/operators/decode_jpeg_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..11616b0e0c4daced68e8faf16a319d0c40f66244 --- /dev/null +++ b/paddle/fluid/operators/decode_jpeg_op.cu @@ -0,0 +1,138 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_HIP) + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/dynload/nvjpeg.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" + +namespace paddle { +namespace operators { + +static cudaStream_t nvjpeg_stream = nullptr; +static nvjpegHandle_t nvjpeg_handle = nullptr; + +void InitNvjpegImage(nvjpegImage_t* img) { + for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) { + img->channel[c] = nullptr; + img->pitch[c] = 0; + } +} + +template +class GPUDecodeJpegKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Create nvJPEG handle + if (nvjpeg_handle == nullptr) { + nvjpegStatus_t create_status = + platform::dynload::nvjpegCreateSimple(&nvjpeg_handle); + + PADDLE_ENFORCE_EQ(create_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegCreateSimple failed: ", + create_status)); + } + + nvjpegJpegState_t nvjpeg_state; + nvjpegStatus_t state_status = + platform::dynload::nvjpegJpegStateCreate(nvjpeg_handle, &nvjpeg_state); + + PADDLE_ENFORCE_EQ(state_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegJpegStateCreate failed: ", + state_status)); + + int components; + nvjpegChromaSubsampling_t subsampling; + int widths[NVJPEG_MAX_COMPONENT]; + int heights[NVJPEG_MAX_COMPONENT]; + + auto* x = ctx.Input("X"); + auto* x_data = x->data(); + + nvjpegStatus_t info_status = platform::dynload::nvjpegGetImageInfo( + nvjpeg_handle, x_data, (size_t)x->numel(), &components, &subsampling, + widths, heights); + + PADDLE_ENFORCE_EQ( + info_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegGetImageInfo failed: ", info_status)); + + int width = widths[0]; + int height = heights[0]; + + nvjpegOutputFormat_t output_format; + int output_components; + + auto mode = ctx.Attr("mode"); + if (mode == "unchanged") { + if (components == 1) { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (components == 3) { + output_format = 
NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + } else if (mode == "gray") { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (mode == "rgb") { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + + nvjpegImage_t out_image; + InitNvjpegImage(&out_image); + + // create nvjpeg stream + if (nvjpeg_stream == nullptr) { + cudaStreamCreateWithFlags(&nvjpeg_stream, cudaStreamNonBlocking); + } + + int sz = widths[0] * heights[0]; + + auto* out = ctx.Output("Out"); + std::vector out_shape = {output_components, height, width}; + out->Resize(framework::make_ddim(out_shape)); + + T* data = out->mutable_data(ctx.GetPlace()); + + for (int c = 0; c < output_components; c++) { + out_image.channel[c] = data + c * sz; + out_image.pitch[c] = width; + } + + nvjpegStatus_t decode_status = platform::dynload::nvjpegDecode( + nvjpeg_handle, nvjpeg_state, x_data, x->numel(), output_format, + &out_image, nvjpeg_stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(decode_jpeg, ops::GPUDecodeJpegKernel) + +#endif diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 6f2a3ca87623847f261f0111bdfd8c168bb24b0a..e6f6c2a39358fdc94b36bd1aa2afd2e5d0a495c6 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -11,6 +11,7 @@ #include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -31,19 +32,44 @@ class YoloBoxOp : public framework::OperatorWithKernel { auto anchors = ctx->Attrs().Get>("anchors"); int anchor_num = anchors.size() / 2; auto class_num = ctx->Attrs().Get("class_num"); + auto iou_aware = ctx->Attrs().Get("iou_aware"); + auto iou_aware_factor = ctx->Attrs().Get("iou_aware_factor"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, platform::errors::InvalidArgument( "Input(X) should be a 4-D tensor." "But received X dimension(%s)", dim_x.size())); - PADDLE_ENFORCE_EQ( - dim_x[1], anchor_num * (5 + class_num), - platform::errors::InvalidArgument( - "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " - "+ class_num))." - "But received dim[1](%s) != (anchor_mask_number * " - "(5+class_num)(%s).", - dim_x[1], anchor_num * (5 + class_num))); + if (iou_aware) { + PADDLE_ENFORCE_EQ( + dim_x[1], anchor_num * (6 + class_num), + platform::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (6 " + "+ class_num)) while iou_aware is true." + "But received dim[1](%s) != (anchor_mask_number * " + "(6+class_num)(%s).", + dim_x[1], anchor_num * (6 + class_num))); + PADDLE_ENFORCE_GE( + iou_aware_factor, 0, + platform::errors::InvalidArgument( + "Attr(iou_aware_factor) should greater than or equal to 0." + "But received iou_aware_factor (%s)", + iou_aware_factor)); + PADDLE_ENFORCE_LE( + iou_aware_factor, 1, + platform::errors::InvalidArgument( + "Attr(iou_aware_factor) should less than or equal to 1." 
+ "But received iou_aware_factor (%s)", + iou_aware_factor)); + } else { + PADDLE_ENFORCE_EQ( + dim_x[1], anchor_num * (5 + class_num), + platform::errors::InvalidArgument( + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))." + "But received dim[1](%s) != (anchor_mask_number * " + "(5+class_num)(%s).", + dim_x[1], anchor_num * (5 + class_num))); + } PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2, platform::errors::InvalidArgument( "Input(ImgSize) should be a 2-D tensor." @@ -140,6 +166,10 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { "Scale the center point of decoded bounding " "box. Default 1.0") .SetDefault(1.); + AddAttr("iou_aware", "Whether use iou aware. Default false.") + .SetDefault(false); + AddAttr("iou_aware_factor", "iou aware factor. Default 0.5.") + .SetDefault(0.5); AddComment(R"DOC( This operator generates YOLO detection boxes from output of YOLOv3 network. @@ -147,7 +177,8 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { should be the same, H and W specify the grid size, each grid point predict given number boxes, this given number, which following will be represented as S, is specified by the number of anchors. In the second dimension(the channel - dimension), C should be equal to S * (5 + class_num), class_num is the object + dimension), C should be equal to S * (5 + class_num) if :attr:`iou_aware` is false, + otherwise C should be equal to S * (6 + class_num). class_num is the object category number of source dataset(such as 80 in coco dataset), so the second(channel) dimension, apart from 4 box location coordinates x, y, w, h, also includes confidence score of the box and class one-hot key of each anchor @@ -183,6 +214,15 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { score_{pred} = score_{conf} * score_{class} $$ + where the confidence scores follow the formula bellow + + .. math:: + + score_{conf} = \begin{case} + obj, \text{if } iou_aware == flase \\ + obj^{1 - iou_aware_factor} * iou^{iou_aware_factor}, \text{otherwise} + \end{case} + )DOC"); } }; @@ -197,3 +237,12 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel, ops::YoloBoxKernel); + +REGISTER_OP_VERSION(yolo_box) + .AddCheckpoint( + R"ROC( + Upgrade yolo box to add new attribute [iou_aware, iou_aware_factor]. 
+ )ROC", + paddle::framework::compatible::OpVersionDesc() + .NewAttr("iou_aware", "Whether use iou aware", false) + .NewAttr("iou_aware_factor", "iou aware factor", 0.5f)); diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 65dc73ef38323521590c9f5914ac13b321ef4469..83a0eb87d02dd549521b68a112c5d9eea6055159 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -28,7 +28,8 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, const int w, const int an_num, const int class_num, const int box_num, int input_size_h, int input_size_w, bool clip_bbox, const float scale, - const float bias) { + const float bias, bool iou_aware, + const float iou_aware_factor) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; T box[4]; @@ -43,23 +44,29 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, int img_height = imgsize[2 * i]; int img_width = imgsize[2 * i + 1]; - int obj_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4, + iou_aware); T conf = sigmoid(input[obj_idx]); + if (iou_aware) { + int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); + T iou = sigmoid(input[iou_idx]); + conf = pow(conf, static_cast(1. - iou_aware_factor)) * + pow(iou, static_cast(iou_aware_factor)); + } if (conf < conf_thresh) { continue; } - int box_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0, + iou_aware); GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, input_size_w, box_idx, grid_num, img_height, img_width, scale, bias); box_idx = (i * box_num + j * grid_num + k * w + l) * 4; CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); - int label_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); + int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, + 5, iou_aware); int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, grid_num); @@ -80,6 +87,8 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { float conf_thresh = ctx.Attr("conf_thresh"); int downsample_ratio = ctx.Attr("downsample_ratio"); bool clip_bbox = ctx.Attr("clip_bbox"); + bool iou_aware = ctx.Attr("iou_aware"); + float iou_aware_factor = ctx.Attr("iou_aware_factor"); float scale = ctx.Attr("scale_x_y"); float bias = -0.5 * (scale - 1.); @@ -111,11 +120,18 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num); - KeYoloBoxFw<<<<>>( input_data, imgsize_data, boxes_data, scores_data, conf_thresh, anchors_data, n, h, w, an_num, class_num, box_num, input_size_h, - input_size_w, clip_bbox, scale, bias); + input_size_w, clip_bbox, scale, bias, iou_aware, iou_aware_factor); } }; diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h index 1cfef142bca7327cb039412719b7c002beb53cab..e06c81052a0f42c9db4d96e49d2708e64e4f3137 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -13,6 +13,7 @@ #include #include #include "paddle/fluid/framework/op_registry.h" 
+#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { @@ -43,8 +44,19 @@ HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num, int an_stride, int stride, - int entry) { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + int entry, bool iou_aware) { + if (iou_aware) { + return (batch * an_num + an_idx) * an_stride + + (batch * an_num + an_num + entry) * stride + hw_idx; + } else { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + } +} + +HOSTDEVICE inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num, + int an_stride, int stride) { + return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + + hw_idx; } template @@ -92,6 +104,8 @@ class YoloBoxKernel : public framework::OpKernel { float conf_thresh = ctx.Attr("conf_thresh"); int downsample_ratio = ctx.Attr("downsample_ratio"); bool clip_bbox = ctx.Attr("clip_bbox"); + bool iou_aware = ctx.Attr("iou_aware"); + float iou_aware_factor = ctx.Attr("iou_aware_factor"); float scale = ctx.Attr("scale_x_y"); float bias = -0.5 * (scale - 1.); @@ -127,15 +141,22 @@ class YoloBoxKernel : public framework::OpKernel { for (int j = 0; j < an_num; j++) { for (int k = 0; k < h; k++) { for (int l = 0; l < w; l++) { - int obj_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 4); + int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, + stride, 4, iou_aware); T conf = sigmoid(input_data[obj_idx]); + if (iou_aware) { + int iou_idx = + GetIoUIndex(i, j, k * w + l, an_num, an_stride, stride); + T iou = sigmoid(input_data[iou_idx]); + conf = pow(conf, static_cast(1. - iou_aware_factor)) * + pow(iou, static_cast(iou_aware_factor)); + } if (conf < conf_thresh) { continue; } - int box_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); + int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, + stride, 0, iou_aware); GetYoloBox(box, input_data, anchors_data, l, k, j, h, w, input_size_h, input_size_w, box_idx, stride, img_height, img_width, scale, bias); @@ -143,8 +164,8 @@ class YoloBoxKernel : public framework::OpKernel { CalcDetectionBox(boxes_data, box, box_idx, img_height, img_width, clip_bbox); - int label_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5); + int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, + stride, 5, iou_aware); int score_idx = (i * box_num + j * stride + k * w + l) * class_num; CalcLabelScore(scores_data, input_data, label_idx, score_idx, class_num, conf, stride); diff --git a/paddle/fluid/operators/diagonal_op.cc b/paddle/fluid/operators/diagonal_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd5a84ade59cedf55e8cdb23cc2b8e7b886d7bd7 --- /dev/null +++ b/paddle/fluid/operators/diagonal_op.cc @@ -0,0 +1,186 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/diagonal_op.h" + +namespace paddle { +namespace operators { + +class DiagonalOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "diagonal"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diagonal"); + + int offset_ = ctx->Attrs().Get("offset"); + int axis1 = ctx->Attrs().Get("axis1"); + int axis2 = ctx->Attrs().Get("axis2"); + + auto x_dims = ctx->GetInputDim("Input"); + int axis1_ = axis1 < 0 ? x_dims.size() + axis1 : axis1; + int axis2_ = axis2 < 0 ? x_dims.size() + axis2 : axis2; + + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + platform::errors::OutOfRange("Input's dim is out of range (expected at " + "least 2 dimensions, but got %ld).", + x_dims.size())); + PADDLE_ENFORCE_LT( + axis1_, x_dims.size(), + platform::errors::OutOfRange( + "Attr(axis1) is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size()), (x_dims.size() - 1), axis1)); + PADDLE_ENFORCE_LT( + axis2_, x_dims.size(), + platform::errors::OutOfRange( + "Attr(axis2) is out of range (expected to be in range of [%ld, " + "%ld], but got %ld).", + -(x_dims.size()), (x_dims.size() - 1), axis2)); + PADDLE_ENFORCE_NE(axis1_, axis2_, + platform::errors::InvalidArgument( + "The dimensions should not be identical " + "%d vs %d.", + axis1, axis2)); + + auto out_dims = vectorize(x_dims); + // from out_dims get the dim size of axis1_. + auto axis1_size = out_dims[axis1_]; + auto axis2_size = out_dims[axis2_]; + // delete two dims by attr axis1 and axis2 from out_dims. + /* example: + out_dim = [2, 3, 4]; + axis1 = 0; + axis2 = 1; + according to the attr of axis1 and axis2, we get: + out_dim = [4]. + */ + out_dims.erase(out_dims.begin() + std::max(axis1_, axis2_)); + out_dims.erase(out_dims.begin() + std::min(axis1_, axis2_)); + + if (offset_ == 0) { + out_dims.push_back(std::min(axis1_size, axis2_size)); + } else if (offset_ > 0) { + if ((axis2_size - offset_) > 0) { + out_dims.push_back(std::min(axis1_size, axis2_size - offset_)); + } else { + out_dims.push_back(0); + } + } else { + if ((axis1_size + offset_) > 0) { + out_dims.push_back(std::min(axis1_size + offset_, axis2_size)); + } else { + out_dims.push_back(0); + } + } + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } +}; + +class DiagonalOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "(Tensor) The input tensor, from which the diagonals are taken."); + AddOutput( + "Out", + "(Tensor) The partial view of input with the its diagonal elements."); + AddAttr( + "offset", + R"DOC((int, default 0), offset of the diagonal from the main diagonal. Can be both positive and negative. Default: 0. + )DOC") + .SetDefault(0); + AddAttr( + "axis1", + R"DOC((int, default 0), the first axis of the 2-D planes from which the diagonals should be taken. + Can be either positive or negative. Default: 0. + )DOC") + .SetDefault(0); + AddAttr( + "axis2", + R"DOC((int, default 1), the second axis of the 2-D planes from which the diagonals should be taken. + Can be either positive or negative. Default: 1. + )DOC") + .SetDefault(1); + AddComment(R"DOC( +Diagonal Operator. +Return a partial view of input with the its diagonal elements of the input tensor. 
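For example (a worked illustration of the shape rule implemented in InferShape above): for an input of shape [2, 3, 4] with axis1 = 0, axis2 = 1 and offset = 0, the two diagonal axes are removed and a trailing axis of length min(2, 3) = 2 is appended, giving an output of shape [4, 2]; with offset = 1 the trailing length is min(2, 3 - 1) = 2, and a sufficiently negative offset (for example -2) yields an empty trailing axis of length 0.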
+The behavior of this operator is similar to how `numpy.diagonal` works. + +)DOC"); + } +}; + +class DiagonalGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "DiagonalGrad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Input")), "Output", + framework::GradVarName("Input"), "DiagonalGrad"); + + ctx->SetOutputDim(framework::GradVarName("Input"), + ctx->GetInputDim("Input")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class DiagonalGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType("diagonal_grad"); + grad_op->SetInput("Input", this->Input("Input")); + grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + grad_op->SetOutput(framework::GradVarName("Input"), + this->InputGrad("Input")); + grad_op->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERER(DiagonalGradNoNeedBufferVarsInferer, + "Input"); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(diagonal, ops::DiagonalOp, ops::DiagonalOpMaker, + ops::DiagonalGradOpMaker, + ops::DiagonalGradOpMaker); + +REGISTER_OPERATOR(diagonal_grad, ops::DiagonalGradOp, + ops::DiagonalGradNoNeedBufferVarsInferer) + +REGISTER_OP_CPU_KERNEL(diagonal, ops::DiagonalKernel, + ops::DiagonalKernel, ops::DiagonalKernel, + ops::DiagonalKernel, ops::DiagonalKernel); + +REGISTER_OP_CPU_KERNEL(diagonal_grad, ops::DiagonalGradKernel, + ops::DiagonalGradKernel, + ops::DiagonalGradKernel, + ops::DiagonalGradKernel); diff --git a/paddle/fluid/operators/diagonal_op.cu b/paddle/fluid/operators/diagonal_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..e2b5f24d6619e1dc70a3d84256ec1aeb18b90589 --- /dev/null +++ b/paddle/fluid/operators/diagonal_op.cu @@ -0,0 +1,273 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/diagonal_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void Diagonal(const T* data1, T* data2, const int64_t offset_, + int64_t axis1_, int64_t axis2_, int64_t* x_stride, + int64_t* out_stride, int64_t numel, bool is_grad) { + CUDA_KERNEL_LOOP(idx, numel) { + int64_t idx_dim[X_DIM_SIZE] = {0}; + int64_t temp = 0; + for (size_t i = 0; i < X_DIM_SIZE - 1; i++) { + idx_dim[i] = (idx - temp) / x_stride[i]; + temp = temp + idx_dim[i] * x_stride[i]; + } + idx_dim[X_DIM_SIZE - 1] = idx - temp; + + int64_t axis1_dim = idx_dim[axis1_]; + int64_t axis2_dim = idx_dim[axis2_]; + + int64_t out_dim[OUT_DIM_SIZE] = {0}; + int temp_pos = 0; + for (int i = 0; i < X_DIM_SIZE; i++) { + if (i != axis1_ && i != axis2_) { + out_dim[temp_pos] = idx_dim[i]; + temp_pos++; + } + } + bool flag = false; + if (offset_ == 0 && axis1_dim == axis2_dim) { + out_dim[temp_pos] = axis1_dim; + flag = true; + } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) { + out_dim[temp_pos] = axis1_dim; + flag = true; + } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) { + out_dim[temp_pos] = axis2_dim; + flag = true; + } + if (!is_grad) { + if (flag) { + int64_t idx_output = 0; + for (size_t i = 0; i < OUT_DIM_SIZE - 1; i++) { + idx_output = idx_output + out_dim[i] * out_stride[i]; + } + idx_output = idx_output + out_dim[OUT_DIM_SIZE - 1]; + data2[idx_output] = data1[idx]; + } + } else { + if (flag) { + int64_t idx_output = 0; + for (size_t i = 0; i < OUT_DIM_SIZE - 1; i++) { + idx_output = idx_output + out_dim[i] * out_stride[i]; + } + idx_output = idx_output + out_dim[OUT_DIM_SIZE - 1]; + data2[idx] = data1[idx_output]; + } else { + data2[idx] = static_cast(0); + } + } + } +} + +template +class DiagonalCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("Input"); + const auto* input_data = input->data(); + auto input_dim = input->dims().Get(); + auto input_dim_size = input->dims().size(); + + std::vector res_in = vectorize(framework::stride(input->dims())); + paddle::framework::Tensor input_stride_tensor; + framework::TensorFromVector(res_in, context.device_context(), + &input_stride_tensor); + int64_t* input_stride = input_stride_tensor.data(); + + auto* output = context.Output("Out"); + auto* output_data = output->mutable_data(context.GetPlace()); + auto output_dim = output->dims().Get(); + auto output_dim_size = output->dims().size(); + + std::vector res_out = vectorize(framework::stride(output->dims())); + paddle::framework::Tensor output_stride_tensor; + framework::TensorFromVector(res_out, context.device_context(), + &output_stride_tensor); + int64_t* output_stride = output_stride_tensor.data(); + + const int64_t offset_ = context.Attr("offset"); + const int64_t axis1 = context.Attr("axis1"); + int64_t axis1_ = axis1 < 0 ? input_dim_size + axis1 : axis1; + const int64_t axis2 = context.Attr("axis2"); + int64_t axis2_ = axis2 < 0 ? 
input_dim_size + axis2 : axis2; + int64_t numel = input->numel(); + + int threads = PADDLE_CUDA_NUM_THREADS; + int blocks = (numel + threads - 1) / threads; + + switch (input_dim_size) { + case 2: + Diagonal<<>>(input_data, output_data, offset_, + axis1_, axis2_, input_stride, + output_stride, numel, false); + break; + case 3: + Diagonal<<>>(input_data, output_data, offset_, + axis1_, axis2_, input_stride, + output_stride, numel, false); + break; + case 4: + Diagonal<<>>(input_data, output_data, offset_, + axis1_, axis2_, input_stride, + output_stride, numel, false); + break; + case 5: + Diagonal<<>>(input_data, output_data, offset_, + axis1_, axis2_, input_stride, + output_stride, numel, false); + break; + case 6: + Diagonal<<>>(input_data, output_data, offset_, + axis1_, axis2_, input_stride, + output_stride, numel, false); + break; + case 7: + Diagonal<<>>(input_data, output_data, offset_, + axis1_, axis2_, input_stride, + output_stride, numel, false); + break; + case 8: + Diagonal<<>>(input_data, output_data, offset_, + axis1_, axis2_, input_stride, + output_stride, numel, false); + break; + case 9: + Diagonal<<>>(input_data, output_data, offset_, + axis1_, axis2_, input_stride, + output_stride, numel, false); + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "The rank of input should be less than 10, but received %d.", + input_dim_size)); + } + } +}; + +template +class DiagonalGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* dout = + context.Input(framework::GradVarName("Out")); + const auto* dout_data = dout->data(); + auto dout_dim = dout->dims().Get(); + auto dout_dim_size = dout->dims().size(); + + std::vector res_dout = vectorize(framework::stride(dout->dims())); + paddle::framework::Tensor dout_stride_tensor; + framework::TensorFromVector(res_dout, context.device_context(), + &dout_stride_tensor); + int64_t* dout_stride = dout_stride_tensor.data(); + + auto* dx = + context.Output(framework::GradVarName("Input")); + auto* dx_data = dx->mutable_data(context.GetPlace()); + auto dx_dim = dx->dims().Get(); + auto dx_dim_size = dx->dims().size(); + + std::vector res_dx = vectorize(framework::stride(dx->dims())); + paddle::framework::Tensor dx_stride_tensor; + framework::TensorFromVector(res_dx, context.device_context(), + &dx_stride_tensor); + int64_t* dx_stride = dx_stride_tensor.data(); + + const int64_t offset_ = context.Attr("offset"); + const int64_t axis1 = context.Attr("axis1"); + int64_t axis1_ = axis1 < 0 ? dx_dim_size + axis1 : axis1; + const int64_t axis2 = context.Attr("axis2"); + int64_t axis2_ = axis2 < 0 ? 
dx_dim_size + axis2 : axis2; + + int64_t numel = dx->numel(); + + int threads = PADDLE_CUDA_NUM_THREADS; + int blocks = (numel + threads - 1) / threads; + + switch (dx_dim_size) { + case 2: + Diagonal<<>>(dout_data, dx_data, offset_, + axis1_, axis2_, dx_stride, + dout_stride, numel, true); + break; + case 3: + Diagonal<<>>(dout_data, dx_data, offset_, + axis1_, axis2_, dx_stride, + dout_stride, numel, true); + break; + case 4: + Diagonal<<>>(dout_data, dx_data, offset_, + axis1_, axis2_, dx_stride, + dout_stride, numel, true); + break; + case 5: + Diagonal<<>>(dout_data, dx_data, offset_, + axis1_, axis2_, dx_stride, + dout_stride, numel, true); + break; + case 6: + Diagonal<<>>(dout_data, dx_data, offset_, + axis1_, axis2_, dx_stride, + dout_stride, numel, true); + break; + case 7: + Diagonal<<>>(dout_data, dx_data, offset_, + axis1_, axis2_, dx_stride, + dout_stride, numel, true); + break; + case 8: + Diagonal<<>>(dout_data, dx_data, offset_, + axis1_, axis2_, dx_stride, + dout_stride, numel, true); + break; + case 9: + Diagonal<<>>(dout_data, dx_data, offset_, + axis1_, axis2_, dx_stride, + dout_stride, numel, true); + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "The rank of output(input@Grad) should be less than 10, but " + "received %d.", + dx_dim_size)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(diagonal, ops::DiagonalCUDAKernel, + ops::DiagonalCUDAKernel, + ops::DiagonalCUDAKernel, + ops::DiagonalCUDAKernel, + ops::DiagonalCUDAKernel, + ops::DiagonalCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(diagonal_grad, ops::DiagonalGradCUDAKernel, + ops::DiagonalGradCUDAKernel, + ops::DiagonalGradCUDAKernel, + ops::DiagonalGradCUDAKernel, + ops::DiagonalGradCUDAKernel); diff --git a/paddle/fluid/operators/diagonal_op.h b/paddle/fluid/operators/diagonal_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a0380e9e52caced2e1ae65f87de3e3eb7266c1c8 --- /dev/null +++ b/paddle/fluid/operators/diagonal_op.h @@ -0,0 +1,163 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
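Both the CUDA kernels above and the CPU kernels in the header below use the same addressing scheme: build row-major strides, decompose a flat element index into per-axis coordinates, and keep an element when its two diagonal coordinates satisfy the offset condition. A small self-contained sketch of that mapping, with illustrative names rather than the operator code itself:

#include <cstdint>
#include <cstdio>
#include <vector>

// Row-major strides, equivalent to ComputeDimStride() in the header below.
std::vector<int64_t> Strides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> s(dims.size(), 1);
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    s[i] = s[i + 1] * dims[i + 1];
  }
  return s;
}

// Decompose a flat index into coordinates and report whether the element lies
// on the requested diagonal (axis1/axis2 are assumed already non-negative).
bool OnDiagonal(int64_t idx, const std::vector<int64_t>& strides, int64_t axis1,
                int64_t axis2, int64_t offset) {
  std::vector<int64_t> coord(strides.size());
  for (size_t i = 0; i < strides.size(); ++i) {
    coord[i] = idx / strides[i];
    idx -= coord[i] * strides[i];
  }
  // The three offset branches in the kernels collapse to this single test.
  return coord[axis1] + offset == coord[axis2];
}

int main() {
  std::vector<int64_t> dims = {3, 4};
  auto s = Strides(dims);
  // Element (1, 2) of a 3x4 matrix sits on the offset = 1 super-diagonal.
  std::printf("%d\n", OnDiagonal(1 * s[0] + 2 * s[1], s, 0, 1, 1) ? 1 : 0);
  return 0;
}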
+ +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template + +std::vector ComputeDimStride(const std::vector dim) { + size_t dim_size = dim.size(); + std::vector dim_strides; + dim_strides.resize(dim_size); + for (size_t i = 0; i < dim_size - 1; i++) { + size_t temp_stride = 1; + for (size_t j = i + 1; j < dim_size; j++) { + temp_stride = temp_stride * dim[j]; + } + dim_strides[i] = temp_stride; + } + dim_strides[dim_size - 1] = 1; + return dim_strides; +} +template +class DiagonalKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("Input"); + const T* input_data = input->data(); + auto input_dim = vectorize(input->dims()); + auto input_dim_size = input_dim.size(); + + auto* output = context.Output("Out"); + T* output_data = output->mutable_data(context.GetPlace()); + auto output_dim = vectorize(output->dims()); + + const int64_t offset_ = context.Attr("offset"); + const int64_t axis1 = context.Attr("axis1"); + int64_t axis1_ = axis1 < 0 ? input_dim_size + axis1 : axis1; + const int64_t axis2 = context.Attr("axis2"); + int64_t axis2_ = axis2 < 0 ? input_dim_size + axis2 : axis2; + + std::vector input_stride = ComputeDimStride(input_dim); + std::vector output_stride = ComputeDimStride(output_dim); + + int64_t numel = input->numel(); + + for (int64_t idx = 0; idx < numel; idx++) { + std::vector idx_dim(input_dim_size); + int64_t temp = 0; + for (size_t i = 0; i < input_dim_size; i++) { + idx_dim[i] = (idx - temp) / input_stride[i]; + temp = temp + idx_dim[i] * input_stride[i]; + } + + int64_t axis1_dim = idx_dim[axis1_]; + int64_t axis2_dim = idx_dim[axis2_]; + + idx_dim.erase(idx_dim.begin() + std::max(axis1_, axis2_)); + idx_dim.erase(idx_dim.begin() + std::min(axis1_, axis2_)); + + bool flag = false; + if (offset_ == 0 && axis1_dim == axis2_dim) { + idx_dim.push_back(axis1_dim); + flag = true; + } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) { + idx_dim.push_back(axis1_dim); + flag = true; + } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) { + idx_dim.push_back(axis2_dim); + flag = true; + } + if (flag) { + int64_t idx_output = 0; + for (size_t i = 0; i < idx_dim.size(); i++) { + idx_output = idx_output + idx_dim[i] * output_stride[i]; + } + output_data[idx_output] = input_data[idx]; + } + } + } +}; + +template +class DiagonalGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* dout = + context.Input(framework::GradVarName("Out")); + const T* dout_data = dout->data(); + auto dout_dim = vectorize(dout->dims()); + + auto* dx = + context.Output(framework::GradVarName("Input")); + T* dx_data = dx->mutable_data(context.GetPlace()); + auto dx_dim = vectorize(dx->dims()); + auto dx_dim_size = dx_dim.size(); + + const int64_t offset_ = context.Attr("offset"); + const int64_t axis1 = context.Attr("axis1"); + int64_t axis1_ = axis1 < 0 ? dx_dim_size + axis1 : axis1; + const int64_t axis2 = context.Attr("axis2"); + int64_t axis2_ = axis2 < 0 ? 
dx_dim_size + axis2 : axis2; + + std::vector dout_stride = ComputeDimStride(dout_dim); + std::vector dx_stride = ComputeDimStride(dx_dim); + + int64_t numel = dx->numel(); + + for (int64_t idx = 0; idx < numel; idx++) { + std::vector idx_dim(dx_dim_size); + int64_t temp = 0; + for (size_t i = 0; i < dx_dim_size; i++) { + idx_dim[i] = (idx - temp) / dx_stride[i]; + temp = temp + idx_dim[i] * dx_stride[i]; + } + + int64_t axis1_dim = idx_dim[axis1_]; + int64_t axis2_dim = idx_dim[axis2_]; + + idx_dim.erase(idx_dim.begin() + std::max(axis1_, axis2_)); + idx_dim.erase(idx_dim.begin() + std::min(axis1_, axis2_)); + + bool flag = false; + if (offset_ == 0 && axis1_dim == axis2_dim) { + idx_dim.push_back(axis1_dim); + flag = true; + } else if (offset_ > 0 && (axis1_dim + offset_) == axis2_dim) { + idx_dim.push_back(axis1_dim); + flag = true; + } else if (offset_ < 0 && (axis1_dim + offset_) == axis2_dim) { + idx_dim.push_back(axis2_dim); + flag = true; + } + if (flag) { + int64_t idx_output = 0; + for (size_t i = 0; i < idx_dim.size(); i++) { + idx_output = idx_output + idx_dim[i] * dout_stride[i]; + } + dx_data[idx] = dout_data[idx_output]; + } else { + dx_data[idx] = static_cast(0); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/digamma_op.cc b/paddle/fluid/operators/digamma_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b1a58817e060434d0e309da3476edb5e96b5dfa3 --- /dev/null +++ b/paddle/fluid/operators/digamma_op.cc @@ -0,0 +1,100 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/digamma_op.h" + +namespace paddle { +namespace operators { + +class DigammaOp : public framework::OperatorWithKernel { + public: + DigammaOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Digamma"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Digamma"); + + auto in_dims = ctx->GetInputDim("X"); + + ctx->SetOutputDim("Out", in_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +class DigammaOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of digamma operator."); + AddOutput("Out", "(Tensor), The output tensor of digamma operator."); + AddComment(R"DOC( +Digamma Operator. + +This operator is used to perform elementwise digamma for input $X$. 
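The backward pass added further down relies on the identity that the derivative of the digamma function is the trigamma function (the first polygamma function); DigammaGradFunctor computes exactly that product via Eigen::numext::polygamma(1, x):

$$\frac{\partial out}{\partial x} = \Psi_{1}(x), \qquad dX = dOut \cdot \Psi_{1}(X)$$

The forward mapping defined by this operator is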
+$$out = \Psi(x) = \frac{ \Gamma^{'}(x) }{ \Gamma(x) }$$ + +)DOC"); + } +}; + +class DigammaGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@Grad", "DigammaGrad"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DigammaGrad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + "X@Grad", "DigammaGrad"); + + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), dout_dims); + ctx->ShareLoD(framework::GradVarName("Out"), framework::GradVarName("X")); + } +}; + +template +class DigammaGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("digamma_grad"); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetInput("X", this->Input("X")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(digamma, ops::DigammaOp, ops::DigammaOpMaker, + ops::DigammaGradOpMaker, + ops::DigammaGradOpMaker); +REGISTER_OPERATOR(digamma_grad, ops::DigammaGradOp); + +REGISTER_OP_CPU_KERNEL( + digamma, ops::DigammaKernel, + ops::DigammaKernel); + +REGISTER_OP_CPU_KERNEL( + digamma_grad, + ops::DigammaGradKernel, + ops::DigammaGradKernel); diff --git a/paddle/fluid/operators/hinge_loss_op.cu b/paddle/fluid/operators/digamma_op.cu similarity index 59% rename from paddle/fluid/operators/hinge_loss_op.cu rename to paddle/fluid/operators/digamma_op.cu index b5ea0a702e0e540c1831ca241a5def19f86c239c..5f2f59ba520d0fb1e2c083c211bceba0e4a25715 100644 --- a/paddle/fluid/operators/hinge_loss_op.cu +++ b/paddle/fluid/operators/digamma_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -11,12 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/hinge_loss_op.h" + +#include "paddle/fluid/operators/digamma_op.h" namespace ops = paddle::operators; + REGISTER_OP_CUDA_KERNEL( - hinge_loss, - ops::HingeLossKernel); + digamma, ops::DigammaKernel, + ops::DigammaKernel); + REGISTER_OP_CUDA_KERNEL( - hinge_loss_grad, - ops::HingeLossGradKernel); + digamma_grad, + ops::DigammaGradKernel, + ops::DigammaGradKernel); diff --git a/paddle/fluid/operators/digamma_op.h b/paddle/fluid/operators/digamma_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f82628f020480f5eca22079b13e586e1ebf13643 --- /dev/null +++ b/paddle/fluid/operators/digamma_op.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +template +struct DigammaFunctor { + DigammaFunctor(const T* input, T* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = Eigen::numext::digamma(input_[idx]); + } + + private: + const T* input_; + T* output_; + int64_t numel_; +}; + +template +struct DigammaGradFunctor { + DigammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) + : dout_(dout), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = dout_[idx] * Eigen::numext::polygamma(T(1), x_[idx]); + } + + private: + const T* dout_; + const T* x_; + T* output_; + int64_t numel_; +}; + +using Tensor = framework::Tensor; + +template +class DigammaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + + auto numel = x->numel(); + auto* x_data = x->data(); + auto* out_data = out->mutable_data(context.GetPlace(), + size_t(x->numel() * sizeof(T))); + + auto& dev_ctx = context.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + DigammaFunctor functor(x_data, out_data, numel); + for_range(functor); + } +}; + +template +class DigammaGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* d_out = context.Input(framework::GradVarName("Out")); + const Tensor* x = context.Input("X"); + auto* d_x = context.Output(framework::GradVarName("X")); + + auto numel = d_out->numel(); + auto* dout_data = d_out->data(); + auto* x_data = x->data(); + auto* dx_data = d_x->mutable_data( + context.GetPlace(), static_cast(numel * sizeof(T))); + + auto& dev_ctx = context.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + DigammaGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index 26f12e8f9e3bfa088dfd7e7532dc1e99a5146a89..31acd9718115c78568326532e922aad543164732 100644 --- a/paddle/fluid/operators/dot_op.cc +++ b/paddle/fluid/operators/dot_op.cc @@ -33,7 +33,7 @@ class DotOp : public framework::OperatorWithKernel { "Output(Out) of DotOp should not be null.")); auto x_dims = ctx->GetInputDim("X"); - auto x_rank = (size_t)x_dims.size(); + auto x_rank = static_cast(x_dims.size()); PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank, platform::errors::PreconditionNotMet( "ShapeError: The dimensions of input tensor X (%s) " @@ -154,15 +154,15 @@ REGISTER_OP_CPU_KERNEL( ops::DotKernel, ops::DotKernel, ops::DotKernel, + paddle::platform::complex>, ops::DotKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( dot_grad, ops::DotGradKernel, 
ops::DotGradKernel, ops::DotGradKernel, ops::DotGradKernel, ops::DotGradKernel, + paddle::platform::complex>, ops::DotGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/dot_op.cu b/paddle/fluid/operators/dot_op.cu index 2d259ba1fbc9b4c495eb696e899ad94bb3b5e5be..49f27e1ffb12888e2361e6a504c85b02d84d6480 100644 --- a/paddle/fluid/operators/dot_op.cu +++ b/paddle/fluid/operators/dot_op.cu @@ -22,12 +22,14 @@ REGISTER_OP_CUDA_KERNEL( ops::DotKernel, ops::DotKernel, ops::DotKernel, - ops::DotKernel, - ops::DotKernel); -REGISTER_OP_CUDA_KERNEL( - dot_grad, ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel, - ops::DotGradKernel); + ops::DotKernel>, + ops::DotKernel>); +REGISTER_OP_CUDA_KERNEL(dot_grad, + ops::DotGradKernel, + ops::DotGradKernel, + ops::DotGradKernel, + ops::DotGradKernel, + ops::DotGradKernel>, + ops::DotGradKernel>); diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index 1b607922eda1d854567a338b51121e47064915e4..09d607891b48542876a374cbf00db713befde4b2 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -23,8 +23,6 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; -using complex64 = platform::complex64; -using complex128 = platform::complex128; template struct P { @@ -205,35 +203,25 @@ struct DotGradFunction> { } } #else - const auto* data_dout = tensor_dout->data(); + auto const *x = tensor_x->data(), *y = tensor_y->data(), + *dz = tensor_dout->data(); + auto&& d = tensor_x->dims(); + auto const N = tensor_x->numel(); + auto const B = d[d.size() - 1]; if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); - const auto* data_y = tensor_y->data(); - const framework::DDim& dim = tensor_x->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dx[i] = data_y[i] * data_dout[s]; + auto* dx = tensor_dx->mutable_data(ctx.GetPlace()); + for (auto j = 0; j < N / B; ++j) { + auto const ss = dz[j]; + for (auto i = 0; i < B; ++i) *dx++ = *y++ * ss; } } if (tensor_dy) { - auto* data_dy = tensor_dy->mutable_data(ctx.GetPlace()); - const auto* data_x = tensor_x->data(); - const framework::DDim& dim = tensor_y->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dy[i] = data_x[i] * data_dout[s]; + auto* dy = tensor_dy->mutable_data(ctx.GetPlace()); + for (auto j = 0; j < N / B; ++j) { + auto const ss = dz[j]; + for (auto i = 0; i < B; i++) *dy++ = *x++ * ss; } } #endif @@ -266,21 +254,20 @@ class DotKernel : public framework::OpKernel { out.device(dev) = (x * y).sum(Eigen::DSizes(1)); } #else - const auto* data_x = tensor_x->data(); - const auto* data_y = tensor_y->data(); - auto* data_out = tensor_out->data(); - - auto x_dims = tensor_x->dims(); - auto step = x_dims[x_dims.size() - 1]; - int size = static_cast(framework::product(x_dims)); - - for (int ind = -1, j = 0; j < size; ++j) { - if (j % step == 0) { - ++ind; - data_out[ind] = data_x[j] * data_y[j]; - } else { - data_out[ind] += data_x[j] * data_y[j]; - } + auto const *x = tensor_x->data(), *x_ = &x[0]; + auto const *y = tensor_y->data(), *y_ = &y[0]; + auto* z = tensor_out->data(); + + // Loop over the total N elements of both operands while sum-reducing every + // B pairs along the way where 
B is the dimension of the least ordered axis + auto&& d = tensor_x->dims(); + auto const N = tensor_x->numel(); + auto const B = d[d.size() - 1]; + + for (int j = 0; j < N / B; j++) { + T ss = 0; + for (int i = 0; i < B; i++) ss += (*x_++) * (*y_++); + z[j] = ss; } #endif } diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..b5c8bfff0dc39f0d53308c702addf2fcf83bf796 --- /dev/null +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -0,0 +1,199 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class DropoutNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* seed_tensor = + ctx.HasInput("Seed") ? ctx.Input("Seed") : nullptr; + auto* out = ctx.Output("Out"); + auto* mask = ctx.Output("Mask"); + + auto dropout_prob = ctx.Attr("dropout_prob"); + auto is_test = ctx.Attr("is_test"); + + out->mutable_data(ctx.GetPlace()); + + auto stream = + ctx.template device_context() + .stream(); + + if (dropout_prob == 1.) { + const auto& runner_zeros_out = NpuOpRunner("ZerosLike", {*out}, {*out}); + runner_zeros_out.Run(stream); + mask->mutable_data(ctx.GetPlace()); + const auto& runner_zeros_mask = + NpuOpRunner("ZerosLike", {*mask}, {*mask}); + runner_zeros_mask.Run(stream); + return; + } + + // only achive the default `upscale_in_train` method + if (!is_test) { + Tensor tmp_x(x->type()); + Tensor tmp_out(out->type()); + tmp_x.ShareDataWith(*x); + tmp_out.ShareDataWith(*out); + if (x->dims().size() == 1) { + // DropOutDoMask will get error result when input + // is 1-D. Make it become 2-D. + std::vector vec_dim = framework::vectorize(x->dims()); + tmp_x.Resize(framework::make_ddim({vec_dim[0], 1})); + tmp_out.Resize(framework::make_ddim({vec_dim[0], 1})); + } + + int seed = 0; + int seed2 = 0; + float keep_prob = 1. - dropout_prob; + if (seed_tensor) { + std::vector seed_data; + TensorToVector(*seed_tensor, ctx.device_context(), &seed_data); + seed = seed_data[0]; + } else { + seed = ctx.Attr("fix_seed") ? ctx.Attr("seed") : 0; + } + + Tensor keep_prob_tensor(x->type()); + keep_prob_tensor.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&keep_prob_tensor, + static_cast(keep_prob)); + + mask->mutable_data(ctx.GetPlace()); + + // mask used in `DropOutGenMask` NPU OP is different from + // the output `Mask`. 
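+ // Judging by the sizing below, `DropOutGenMask` emits a bit-packed mask:
+ // the element count is rounded up to a multiple of 128 and the buffer
+ // holds length / 8 uint8 bytes, i.e. one bit per (padded) element.
+ // The per-element uint8 `Mask` output of this kernel is rebuilt later by
+ // casting the dropped-out `out` tensor to bool and then to uint8.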
+ Tensor npu_mask(framework::proto::VarType::UINT8); + uint32_t length = (x->numel() + 128 - 1) / 128 * 128; + npu_mask.Resize(framework::make_ddim({length / 8})); + npu_mask.mutable_data(ctx.GetPlace()); + + // TODO(pangyoki): `keep_prob` used in `DropOutGenMask` NPU + // OP must be a scalar with shape[0]. At present, the shape + // of the `prob` Tensor of this OP is forced to be set to 0 + // in `npu_op_runner.cc`, which needs to be optimized later. + NpuOpRunner runner_gen_mask; + runner_gen_mask.SetType("DropOutGenMask") + .AddInput(framework::vectorize(tmp_out.dims())) + .AddInput(keep_prob_tensor) + .AddOutput(npu_mask) + .AddAttr("seed", seed) + .AddAttr("seed2", seed2); + runner_gen_mask.Run(stream); + + NpuOpRunner runner_dropout; + runner_dropout.SetType("DropOutDoMask") + .AddInput(tmp_x) + .AddInput(npu_mask) + .AddInput(keep_prob_tensor) + .AddOutput(tmp_out); + runner_dropout.Run(stream); + + // cast `out` from float/float16 to bool + Tensor cast_mask(framework::proto::VarType::BOOL); + cast_mask.Resize(mask->dims()); + cast_mask.mutable_data(ctx.GetPlace()); + auto dst_dtype_bool = ConvertToNpuDtype(cast_mask.type()); + const auto& runner_cast_mask_bool = + NpuOpRunner("Cast", {*out}, {cast_mask}, + {{"dst_type", static_cast(dst_dtype_bool)}}); + runner_cast_mask_bool.Run(stream); + + // cast cast_mask from bool to uint8 + auto dst_dtype_uint8 = ConvertToNpuDtype(mask->type()); + const auto& runner_cast_mask_uint8 = + NpuOpRunner("Cast", {cast_mask}, {*mask}, + {{"dst_type", static_cast(dst_dtype_uint8)}}); + runner_cast_mask_uint8.Run(stream); + } else { + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), out); + } + } +}; + +template +class DropoutGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* mask = ctx.Input("Mask"); + + auto dropout_prob = ctx.Attr("dropout_prob"); + auto is_test = ctx.Attr("is_test"); + + PADDLE_ENFORCE_EQ(is_test, false, + platform::errors::PreconditionNotMet( + "GradOp is only callable when is_test is false")); + + dx->mutable_data(ctx.GetPlace()); + + auto stream = + ctx.template device_context() + .stream(); + + if (dropout_prob == 1.) { + const auto& runner_zeros = NpuOpRunner("ZerosLike", {*dx}, {*dx}); + runner_zeros.Run(stream); + return; + } + + // cast mask from uint8 to float32/float16 + Tensor cast_mask(dx->type()); + cast_mask.Resize(mask->dims()); + cast_mask.mutable_data(ctx.GetPlace()); + auto dst_dtype = ConvertToNpuDtype(dx->type()); + const auto& runner_cast_mask = + NpuOpRunner("Cast", {*mask}, {cast_mask}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_mask.Run(stream); + + const auto& runner = + NpuOpRunner("MaskedScale", {*dout, cast_mask}, {*dx}, + {{"value", static_cast(1. 
/ (1 - dropout_prob))}}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + dropout, ops::DropoutNPUKernel, + ops::DropoutNPUKernel); + +REGISTER_OP_NPU_KERNEL( + dropout_grad, + ops::DropoutGradNPUKernel, + ops::DropoutGradNPUKernel); diff --git a/paddle/fluid/operators/dropout_op_xpu.cc b/paddle/fluid/operators/dropout_op_xpu.cc index f5d831fa24012031897eca2ce5a1cd9004f5a03b..79d239074845ad29f4f40e64a7d1ecc9f19168bb 100644 --- a/paddle/fluid/operators/dropout_op_xpu.cc +++ b/paddle/fluid/operators/dropout_op_xpu.cc @@ -16,11 +16,11 @@ namespace paddle { namespace operators { #ifdef PADDLE_WITH_XPU -static std::map mask_data_tables; -static const int max_data_size = 32 * 1024 * 1024; -static std::mutex s_mask_data_table_lock; + template class DropoutXPUKernel : public framework::OpKernel { + using XPUTyp = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { auto* x = context.Input("X"); @@ -30,93 +30,70 @@ class DropoutXPUKernel : public framework::OpKernel { float dropout_prob = context.Attr("dropout_prob"); auto dropout_implementation = context.Attr("dropout_implementation"); - float* mask_data_table = nullptr; + auto& dev_ctx = context.template device_context(); + PADDLE_ENFORCE_EQ(!context.HasInput("Seed"), true, platform::errors::InvalidArgument( ("Input(Seed) not supported on XPU"))); + int is_upscale = (dropout_implementation == "upscale_in_train"); + if (!context.Attr("is_test")) { - int dev_id = - BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()).GetDeviceId(); - int prop = static_cast(dropout_prob * 100); - int is_upscale = (dropout_implementation == "upscale_in_train"); - /* mask_data_tables key contains 3 part: - * | 31-16 | 15-8 | 7-0 | - * | dev_id | prob | is_upscale | - */ - int index = (dev_id << 16) + (prop << 8) + is_upscale; - std::lock_guard lock(s_mask_data_table_lock); - if (mask_data_tables.find(index) == mask_data_tables.end()) { - float* mask_data_host = new float[max_data_size]; - std::random_device rnd; - std::minstd_rand engine; - int seed = - context.Attr("fix_seed") ? context.Attr("seed") : rnd(); - engine.seed(seed); - std::uniform_real_distribution dist(0, 1); - for (size_t i = 0; i < max_data_size; ++i) { - if (dist(engine) < dropout_prob) { - mask_data_host[i] = 0.0f; - } else { - if (is_upscale) { - mask_data_host[i] = 1.0f / static_cast(1.0f - dropout_prob); - } else { - mask_data_host[i] = 1.0; - } - } - } - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&mask_data_table), - max_data_size * sizeof(float)), - XPU_SUCCESS, - platform::errors::ResourceExhausted( - "\n\nOut of memory error on XPU, Cannot" - "allocate %s memory on XPU. \n\nPlease " - "check whether there is any other process " - "using XPU.\n", - string::HumanReadableSize(max_data_size * sizeof(void*)))); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()), - mask_data_table, platform::CPUPlace(), mask_data_host, - max_data_size * sizeof(float)); - mask_data_tables[index] = mask_data_table; - free(mask_data_host); + std::random_device rnd; + // int seed = (context.Attr("fix_seed")) ? 
+ // int(context.Attr("seed")) : (rnd()); + int seed = 0; + if (context.Attr("fix_seed") == true) { + seed = static_cast(context.Attr("seed")); } else { - mask_data_table = mask_data_tables[index]; + seed = rnd(); } - } - if (!context.Attr("is_test")) { // Train + auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); - size_t size = framework::product(mask->dims()); - auto& dev_ctx = context.template device_context(); - int r = xpu::dropout(dev_ctx.x_context(), mask_data_table, x_data, - mask_data, y_data, max_data_size, size); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU dropout return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); - } else { // Infer - float scale = 0.0f; - if (dropout_implementation == "upscale_in_train") { - scale = 1.0f; - } else { - scale = static_cast(1.0f - dropout_prob); + // Special case when dropout_prob is 1.0 + if (dropout_prob == 1.0f) { + int r = xpu::constant(dev_ctx.x_context(), + reinterpret_cast(y_data), y->numel(), + XPUTyp(0)); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(constant) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + r = xpu::constant(dev_ctx.x_context(), + reinterpret_cast(mask_data), mask->numel(), + XPUTyp(0)); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(constant) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + return; } - auto& dev_ctx = context.template device_context(); - int r = xpu::scale(dev_ctx.x_context(), x->numel(), scale, 0.0f, 0, - x_data, y_data); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU dropout return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + int r = xpu::dropout(dev_ctx.x_context(), + reinterpret_cast(x->data()), + reinterpret_cast(y->data()), + reinterpret_cast(mask_data), seed, + mask->numel(), is_upscale, dropout_prob); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(dropout) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } else { + float scale = + (is_upscale) ? 
(1.0) : (static_cast(1.0f - dropout_prob)); + int r = xpu::scale( + dev_ctx.x_context(), reinterpret_cast(x_data), + reinterpret_cast(y_data), x->numel(), false, scale, 0.0f); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(scale) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); } } }; template class DropoutGradXPUKernel : public framework::OpKernel { + using XPUTyp = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { PADDLE_ENFORCE_EQ(!context.Attr("is_test"), true, @@ -127,23 +104,47 @@ class DropoutGradXPUKernel : public framework::OpKernel { auto* mask = context.Input("Mask"); grad_x->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - int r = xpu::elementwise_mul(dev_ctx.x_context(), grad_y->data(), - mask->data(), grad_x->data(), - grad_y->numel()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU dropout return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + auto& dropout_implementation = + context.Attr("dropout_implementation"); + float dropout_prob = context.Attr("dropout_prob"); + const T* mask_data = mask->data(); + framework::Tensor mask_new; + if (dropout_implementation == "upscale_in_train") { + mask_new = context.AllocateTmpTensor( + mask->dims(), dev_ctx); + float scale = + (dropout_prob == 1.0f) ? (1.0f) : (1.0f / (1.0f - dropout_prob)); + int r = xpu::scale(dev_ctx.x_context(), + reinterpret_cast(mask->data()), + reinterpret_cast(mask_new.data()), + mask->numel(), false, scale, 0.0f); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(scale) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + mask_data = mask_new.data(); + } + + int r = xpu::mul( + dev_ctx.x_context(), reinterpret_cast(grad_y->data()), + reinterpret_cast(mask_data), + reinterpret_cast(grad_x->data()), grad_y->numel()); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(mul) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); } }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_XPU_KERNEL( - dropout, ops::DropoutXPUKernel); + dropout, ops::DropoutXPUKernel, + ops::DropoutXPUKernel); REGISTER_OP_XPU_KERNEL( dropout_grad, - ops::DropoutGradXPUKernel); + ops::DropoutGradXPUKernel, + ops::DropoutGradXPUKernel); #endif diff --git a/paddle/fluid/operators/eigen/CMakeLists.txt b/paddle/fluid/operators/eigen/CMakeLists.txt index 848bf2433c5e394bf00f4b335b83da4e0fdec144..8b64e35b93526eb7edbe7f723832126ef7f0e0a6 100644 --- a/paddle/fluid/operators/eigen/CMakeLists.txt +++ b/paddle/fluid/operators/eigen/CMakeLists.txt @@ -1,10 +1,9 @@ file(GLOB EIGEN_CC_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -cc_library(eigen_cc_function SRCS ${EIGEN_CC_SOURCES} DEPS eigen3) -if(WITH_GPU OR WITH_ROCM) - file(GLOB EIGEN_CU_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu") - if(WITH_GPU) - nv_library(eigen_cu_function SRCS ${EIGEN_CU_SOURCES} DEPS eigen3) - elseif(WITH_ROCM) - hip_library(eigen_cu_function SRCS ${EIGEN_CU_SOURCES} DEPS eigen3) - endif() +file(GLOB EIGEN_CU_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu") +if(WITH_GPU) + nv_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3) +elseif(WITH_ROCM) + hip_library(eigen_function SRCS ${EIGEN_CC_SOURCES} ${EIGEN_CU_SOURCES} DEPS eigen3) +else() + 
cc_library(eigen_function SRCS ${EIGEN_CC_SOURCES} DEPS eigen3) endif() diff --git a/paddle/fluid/operators/eigen/constant.cc b/paddle/fluid/operators/eigen/constant.cc new file mode 100644 index 0000000000000000000000000000000000000000..45b03ccbf10043ad142c7de15d7cdf110e134f9a --- /dev/null +++ b/paddle/fluid/operators/eigen/constant.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenConstant { + using Type = Eigen::TensorMap< + Eigen::Tensor>; + static void Eval(const Eigen::DefaultDevice& dev, Type out, const T value) { + out.device(dev) = out.constant(value); + } +}; + +template struct EigenConstant; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/constant.cu b/paddle/fluid/operators/eigen/constant.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf4a2917f7d36f817b53aa892ff1b43b347086c8 --- /dev/null +++ b/paddle/fluid/operators/eigen/constant.cu @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenConstant { + using Type = Eigen::TensorMap< + Eigen::Tensor>; + static void Eval(const Eigen::GpuDevice& dev, Type out, const T value) { + out.device(dev) = out.constant(value); + } +}; + +template struct EigenConstant; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/eigen_function.h b/paddle/fluid/operators/eigen/eigen_function.h index 59669505959f3f2b9d2b5d378e1e0b297df1718e..9a3be7ca439b9aead2e931c7fa3036128400b057 100644 --- a/paddle/fluid/operators/eigen/eigen_function.h +++ b/paddle/fluid/operators/eigen/eigen_function.h @@ -12,6 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif +#ifndef NOMINMAX +#define NOMINMAX +#endif #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { @@ -48,5 +54,207 @@ struct EigenBroadcastGrad { const Array& reduce_dims, const Array2& reshape_dims); }; +template +struct EigenConstant { + using Type = Eigen::TensorMap< + Eigen::Tensor>; + static void Eval(const EigenDevice& dev, Type out, const T value); +}; + +template +struct EigenSign { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in); +}; + +template +struct EigenReverse { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in, + const Array& reverse); +}; + +template +struct EigenAdd { + using InType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + using OutType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in, + const T value); +}; + +template +struct EigenSub { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, const InType& left, + const InType& right); +}; + +template +struct EigenSlice { + using Array = Eigen::DSizes; + using Array32Bit = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in, + const Array& offsets, const Array& extents); + static void Eval(const EigenDevice& dev, OutType32BitIndex out, + const InType32BitIndex& in, const Array32Bit& offsets, + const Array32Bit& extents); +}; + +template +struct EigenPad { + using Array = std::array, Rank>; + using Array32Bit = std::array, Rank>; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in, + const Array& padding, const T value); + static void Eval(const EigenDevice& dev, OutType32BitIndex out, + const InType32BitIndex& in, const Array32Bit& padding, + const T value); +}; + +template +struct EigenScale { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in, + const T scale, const T bias, const bool bias_after_scale); +}; + +template +struct EigenErf { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in); +}; + +template +struct EigenErfGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType din, const InType& in, + const InType& dout); +}; + +template +struct EigenRankLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, const InType& label, + 
const InType& left, const InType& right); +}; + +template +struct EigenRankLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void EvalLeft(const EigenDevice& dev, OutType dleft, + const InType& dout, const InType& label, + const InType& left, const InType& right); + static void EvalRight(const EigenDevice& dev, OutType dright, + const InType& dout, const InType& label, + const InType& left, const InType& right); +}; + +template +struct EigenLogLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, const InType& pred, + const InType& label, const T& epsilon); +}; + +template +struct EigenLogLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType dpred, const InType& dloss, + const InType& pred, const InType& label, const T& epsilon); +}; + +template +struct EigenHingeLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType loss, const InType& pred, + const InType& label); +}; + +template +struct EigenHingeLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType dpred, const InType& dloss, + const InType& pred, const InType& label); +}; + +template +struct EigenL1Norm { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + static void Eval(const EigenDevice& dev, OutType out, const InType& in); +}; + +template +struct EigenL1NormGrad { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType din, const InType& dout, + const InType& in, const Array& bcast); +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/eigen/elementwise.cc b/paddle/fluid/operators/eigen/elementwise.cc new file mode 100644 index 0000000000000000000000000000000000000000..bedecfe5c224feda5126050be1f80843db5b0a87 --- /dev/null +++ b/paddle/fluid/operators/eigen/elementwise.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenAdd { + using InType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + using OutType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in, const T value) { + out.device(dev) = in + value; + } +}; + +template struct EigenAdd; +template struct EigenAdd; +template struct EigenAdd; +template struct EigenAdd; + +template +struct EigenSub { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& left, const InType& right) { + out.device(dev) = left - right; + } +}; + +template struct EigenSub; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/elementwise.cu b/paddle/fluid/operators/eigen/elementwise.cu new file mode 100644 index 0000000000000000000000000000000000000000..a750a06284f5e44fa71440820e2c40c0868f4e6f --- /dev/null +++ b/paddle/fluid/operators/eigen/elementwise.cu @@ -0,0 +1,51 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenAdd { + using InType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + using OutType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in, + const T value) { + out.device(dev) = in + value; + } +}; + +template struct EigenAdd; +template struct EigenAdd; +template struct EigenAdd; +template struct EigenAdd; + +template +struct EigenSub { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& left, + const InType& right) { + out.device(dev) = left - right; + } +}; + +template struct EigenSub; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/erf.cc b/paddle/fluid/operators/eigen/erf.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c2c734c97769418fa9316150c606909acf33eba --- /dev/null +++ b/paddle/fluid/operators/eigen/erf.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/eigen_ext.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenErf { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in) { + out.device(dev) = in.erf(); + } +}; + +template +struct EigenErfGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType din, + const InType& in, const InType& dout) { + din.device(dev) = + dout * static_cast(M_2_SQRTPI) * (-(in.square())).exp(); + } +}; + +#define INSTANTIATION(FUNCTOR) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenErf); +INSTANTIATION(EigenErfGrad); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/erf.cu b/paddle/fluid/operators/eigen/erf.cu new file mode 100644 index 0000000000000000000000000000000000000000..632205bdcbf7efaf6004e071ea078739742a417f --- /dev/null +++ b/paddle/fluid/operators/eigen/erf.cu @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/eigen_ext.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenErf { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in) { + out.device(dev) = in.erf(); + } +}; + +template +struct EigenErfGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType din, const InType& in, + const InType& dout) { + din.device(dev) = + dout * static_cast(M_2_SQRTPI) * (-(in.square())).exp(); + } +}; + +#define INSTANTIATION(FUNCTOR) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenErf); +INSTANTIATION(EigenErfGrad); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/l1_norm.cc b/paddle/fluid/operators/eigen/l1_norm.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7ed60f76662eb7907f4884d93149f6f49bc0bc8 --- /dev/null +++ b/paddle/fluid/operators/eigen/l1_norm.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenL1Norm { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in) { + out.device(dev) = in.abs().sum(); + } +}; + +template +struct EigenL1NormGrad { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType din, + const InType& dout, const InType& in, const Array& bcast) { + din.device(dev) = dout.broadcast(bcast) * in.sign(); + } +}; + +template struct EigenL1Norm; +template struct EigenL1NormGrad; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/l1_norm.cu b/paddle/fluid/operators/eigen/l1_norm.cu new file mode 100644 index 0000000000000000000000000000000000000000..a27cd7ae6b7898d8d7fe4001cdfd447d02e19cb7 --- /dev/null +++ b/paddle/fluid/operators/eigen/l1_norm.cu @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenL1Norm { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = Eigen::TensorMap, Eigen::RowMajor, Eigen::DenseIndex>>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in) { + out.device(dev) = in.abs().sum(); + } +}; + +template +struct EigenL1NormGrad { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType din, const InType& dout, + const InType& in, const Array& bcast) { + din.device(dev) = dout.broadcast(bcast) * in.sign(); + } +}; + +template struct EigenL1Norm; +template struct EigenL1NormGrad; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/loss.cc b/paddle/fluid/operators/eigen/loss.cc new file mode 100644 index 0000000000000000000000000000000000000000..469456537d9aa20564cf9abe2bf1ece735534be3 --- /dev/null +++ b/paddle/fluid/operators/eigen/loss.cc @@ -0,0 +1,123 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenRankLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& label, const InType& left, + const InType& right) { + out.device(dev) = + (1.0f + (left - right).exp()).log() - label * (left - right); + } +}; + +template +struct EigenRankLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + + static void EvalLeft(const Eigen::DefaultDevice& dev, OutType dleft, + const InType& dout, const InType& label, + const InType& left, const InType& right) { + dleft.device(dev) = dout * (1.0f / (1.0f + (right - left).exp()) - label); + } + + static void EvalRight(const Eigen::DefaultDevice& dev, OutType dright, + const InType& dout, const InType& label, + const InType& left, const InType& right) { + dright.device(dev) = -dout * (1.0f / (1.0f + (right - left).exp()) - label); + } +}; + +template struct EigenRankLoss; +template struct EigenRankLossGrad; + +template +struct EigenLogLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& pred, const InType& label, const T& epsilon) { + out.device(dev) = (-(label * (pred + epsilon).log()) - + ((static_cast(1) - label) * + (static_cast(1) - pred + epsilon).log())); + } +}; + +template +struct EigenLogLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType dpred, + const InType& dloss, const InType& pred, const InType& label, + const T& epsilon) { + dpred.device(dev) = + dloss * + (-(label / (pred + epsilon)) + + ((static_cast(1) - label) / (static_cast(1) - pred + epsilon))); + } +}; + +template struct EigenLogLoss; +template struct EigenLogLossGrad; + +template +struct EigenHingeLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType loss, + const InType& pred, const InType& label) { + loss.device(dev) = (static_cast(1) - + pred * (static_cast(2) * label - static_cast(1))) + .cwiseMax(static_cast(0)); + } +}; + +template +struct EigenHingeLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType dpred, + const InType& dloss, const InType& pred, + const InType& label) { + auto alt_labels = static_cast(2) * label - static_cast(1); + dpred.device(dev) = + dloss * ((pred * alt_labels) < static_cast(1)).template cast() * + (-alt_labels); + } +}; + +template struct EigenHingeLoss; +template struct EigenHingeLossGrad; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/loss.cu b/paddle/fluid/operators/eigen/loss.cu new file mode 100644 index 
0000000000000000000000000000000000000000..02341202a2b4f18acc79f7bd4d4c69a69a039eca --- /dev/null +++ b/paddle/fluid/operators/eigen/loss.cu @@ -0,0 +1,123 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenRankLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, + const InType& label, const InType& left, + const InType& right) { + out.device(dev) = + (1.0f + (left - right).exp()).log() - label * (left - right); + } +}; + +template +struct EigenRankLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + + static void EvalLeft(const Eigen::GpuDevice& dev, OutType dleft, + const InType& dout, const InType& label, + const InType& left, const InType& right) { + dleft.device(dev) = dout * (1.0f / (1.0f + (right - left).exp()) - label); + } + + static void EvalRight(const Eigen::GpuDevice& dev, OutType dright, + const InType& dout, const InType& label, + const InType& left, const InType& right) { + dright.device(dev) = -dout * (1.0f / (1.0f + (right - left).exp()) - label); + } +}; + +template struct EigenRankLoss; +template struct EigenRankLossGrad; + +template +struct EigenLogLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& pred, + const InType& label, const T& epsilon) { + out.device(dev) = (-(label * (pred + epsilon).log()) - + ((static_cast(1) - label) * + (static_cast(1) - pred + epsilon).log())); + } +}; + +template +struct EigenLogLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType dpred, + const InType& dloss, const InType& pred, const InType& label, + const T& epsilon) { + dpred.device(dev) = + dloss * + (-(label / (pred + epsilon)) + + ((static_cast(1) - label) / (static_cast(1) - pred + epsilon))); + } +}; + +template struct EigenLogLoss; +template struct EigenLogLossGrad; + +template +struct EigenHingeLoss { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType loss, + const InType& pred, const InType& label) { + loss.device(dev) = (static_cast(1) - + pred * (static_cast(2) * label - static_cast(1))) + .cwiseMax(static_cast(0)); + } +}; + +template +struct EigenHingeLossGrad { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType dpred, + const InType& dloss, const InType& pred, + const InType& label) { + auto alt_labels = static_cast(2) * label - static_cast(1); + dpred.device(dev) = + dloss * ((pred * alt_labels) < static_cast(1)).template cast() 
* + (-alt_labels); + } +}; + +template struct EigenHingeLoss; +template struct EigenHingeLossGrad; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/pad.cc b/paddle/fluid/operators/eigen/pad.cc new file mode 100644 index 0000000000000000000000000000000000000000..421c9eaf5cde2bbbca56512685903ee3dc28fc49 --- /dev/null +++ b/paddle/fluid/operators/eigen/pad.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/complex.h" + +namespace paddle { +namespace operators { + +template +struct EigenPad { + using Array = std::array, Rank>; + using Array32Bit = std::array, Rank>; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in, const Array& padding, const T value) { + out.device(dev) = in.pad(padding, value); + } + + static void Eval(const Eigen::DefaultDevice& dev, OutType32BitIndex out, + const InType32BitIndex& in, const Array32Bit& padding, + const T value) { + out.device(dev) = in.pad(padding, value); + } +}; + +#define INSTANTIATION(FUNCTOR, TYPE) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenPad, int); +INSTANTIATION(EigenPad, int64_t); +INSTANTIATION(EigenPad, float); +INSTANTIATION(EigenPad, double); +INSTANTIATION(EigenPad, platform::complex); +INSTANTIATION(EigenPad, platform::complex); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/pad.cu b/paddle/fluid/operators/eigen/pad.cu new file mode 100644 index 0000000000000000000000000000000000000000..4cf88712d95cbb2e526068ebdfca9999e5fda449 --- /dev/null +++ b/paddle/fluid/operators/eigen/pad.cu @@ -0,0 +1,67 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenPad { + using Array = std::array, Rank>; + using Array32Bit = std::array, Rank>; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in, + const Array& padding, const T value) { + out.device(dev) = in.pad(padding, value); + } + + static void Eval(const Eigen::GpuDevice& dev, OutType32BitIndex out, + const InType32BitIndex& in, const Array32Bit& padding, + const T value) { + out.device(dev) = in.pad(padding, value); + } +}; + +#define INSTANTIATION(FUNCTOR, TYPE) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenPad, int); +INSTANTIATION(EigenPad, int64_t); +INSTANTIATION(EigenPad, float); +INSTANTIATION(EigenPad, double); +INSTANTIATION(EigenPad, platform::float16); +INSTANTIATION(EigenPad, platform::bfloat16); +INSTANTIATION(EigenPad, platform::complex); +INSTANTIATION(EigenPad, platform::complex); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/reverse.cc b/paddle/fluid/operators/eigen/reverse.cc new file mode 100644 index 0000000000000000000000000000000000000000..02044479db952ff27c06148ca39c4a2a3e36330a --- /dev/null +++ b/paddle/fluid/operators/eigen/reverse.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenReverse { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in, const Array& reverse) { + out.device(dev) = in.reverse(reverse); + } +}; + +#define INSTANTIATION(FUNCTOR, TYPE) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenReverse, int); +INSTANTIATION(EigenReverse, uint8_t); +INSTANTIATION(EigenReverse, int64_t); +INSTANTIATION(EigenReverse, bool); +INSTANTIATION(EigenReverse, float); +INSTANTIATION(EigenReverse, double); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/reverse.cu b/paddle/fluid/operators/eigen/reverse.cu new file mode 100644 index 0000000000000000000000000000000000000000..9b769489ce723678b2cc1440bf6c3d374e3a55d6 --- /dev/null +++ b/paddle/fluid/operators/eigen/reverse.cu @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenReverse { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in, + const Array& reverse) { + out.device(dev) = in.reverse(reverse); + } +}; + +#define INSTANTIATION(FUNCTOR, TYPE) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenReverse, int); +INSTANTIATION(EigenReverse, uint8_t); +INSTANTIATION(EigenReverse, int64_t); +INSTANTIATION(EigenReverse, bool); +INSTANTIATION(EigenReverse, float); +INSTANTIATION(EigenReverse, double); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/scale.cc b/paddle/fluid/operators/eigen/scale.cc new file mode 100644 index 0000000000000000000000000000000000000000..e85878f20aa2b80b398561938ad96f6349cb7eec --- /dev/null +++ b/paddle/fluid/operators/eigen/scale.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/bfloat16.h" + +namespace paddle { +namespace operators { + +template +struct EigenScale { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in, const T scale, const T bias, + const bool bias_after_scale) { + if (bias_after_scale) { + out.device(dev) = scale * in + bias; + } else { + out.device(dev) = scale * (in + bias); + } + } +}; + +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/scale.cu b/paddle/fluid/operators/eigen/scale.cu new file mode 100644 index 0000000000000000000000000000000000000000..6a77f72f6200c0640d08e5ba9e1ddfb39211aaed --- /dev/null +++ b/paddle/fluid/operators/eigen/scale.cu @@ -0,0 +1,46 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenScale { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in, + const T scale, const T bias, const bool bias_after_scale) { + if (bias_after_scale) { + out.device(dev) = scale * in + bias; + } else { + out.device(dev) = scale * (in + bias); + } + } +}; + +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; +template struct EigenScale; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/sign.cc b/paddle/fluid/operators/eigen/sign.cc new file mode 100644 index 0000000000000000000000000000000000000000..4a4445f6569d388a4181eec1bed2faf190aeb729 --- /dev/null +++ b/paddle/fluid/operators/eigen/sign.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" + +namespace paddle { +namespace operators { + +template +struct EigenSign { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in) { + out.device(dev) = in.sign(); + } +}; + +template struct EigenSign; +template struct EigenSign; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/sign.cu b/paddle/fluid/operators/eigen/sign.cu new file mode 100644 index 0000000000000000000000000000000000000000..52c8d3c80dd2c5d0d64e9a92ae596d7b69e70476 --- /dev/null +++ b/paddle/fluid/operators/eigen/sign.cu @@ -0,0 +1,37 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/eigen_ext.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenSign { + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in) { + out.device(dev) = in.sign(); + } +}; + +template struct EigenSign; +template struct EigenSign; +template struct EigenSign; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/slice.cc b/paddle/fluid/operators/eigen/slice.cc new file mode 100644 index 0000000000000000000000000000000000000000..2579b5f07eb27817f5488d8065fa05f409d1163f --- /dev/null +++ b/paddle/fluid/operators/eigen/slice.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenSlice { + using Array = Eigen::DSizes; + using Array32Bit = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + + static void Eval(const Eigen::DefaultDevice& dev, OutType out, + const InType& in, const Array& offsets, + const Array& extents) { + out.device(dev) = in.slice(offsets, extents); + } + + static void Eval(const Eigen::DefaultDevice& dev, OutType32BitIndex out, + const InType32BitIndex& in, const Array32Bit& offsets, + const Array32Bit& extents) { + out.device(dev) = in.slice(offsets, extents); + } +}; + +#define INSTANTIATION(FUNCTOR, TYPE) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenSlice, bool); +INSTANTIATION(EigenSlice, int); +INSTANTIATION(EigenSlice, int8_t); +INSTANTIATION(EigenSlice, uint8_t); +INSTANTIATION(EigenSlice, int16_t); +INSTANTIATION(EigenSlice, int64_t); +INSTANTIATION(EigenSlice, float); +INSTANTIATION(EigenSlice, double); +INSTANTIATION(EigenSlice, platform::float16); +INSTANTIATION(EigenSlice, platform::bfloat16); +INSTANTIATION(EigenSlice, platform::complex); +INSTANTIATION(EigenSlice, platform::complex); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/slice.cu b/paddle/fluid/operators/eigen/slice.cu new file mode 100644 index 0000000000000000000000000000000000000000..dc51fa722202bb2d8b7fb168255a13916f3dc157 --- /dev/null +++ b/paddle/fluid/operators/eigen/slice.cu @@ -0,0 +1,67 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenSlice { + using Array = Eigen::DSizes; + using Array32Bit = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + + static void Eval(const Eigen::GpuDevice& dev, OutType out, const InType& in, + const Array& offsets, const Array& extents) { + out.device(dev) = in.slice(offsets, extents); + } + + static void Eval(const Eigen::GpuDevice& dev, OutType32BitIndex out, + const InType32BitIndex& in, const Array32Bit& offsets, + const Array32Bit& extents) { + out.device(dev) = in.slice(offsets, extents); + } +}; + +#define INSTANTIATION(FUNCTOR, TYPE) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenSlice, int); +INSTANTIATION(EigenSlice, int64_t); +INSTANTIATION(EigenSlice, float); +INSTANTIATION(EigenSlice, double); +INSTANTIATION(EigenSlice, platform::float16); +INSTANTIATION(EigenSlice, platform::bfloat16); +INSTANTIATION(EigenSlice, platform::complex); +INSTANTIATION(EigenSlice, platform::complex); +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index b551629169deed66a1a79636287569995726c4be..67e2e3a1e96772c7508724c1cb21cf670bb84e31 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -20,8 +20,8 @@ limitations under the License. */ namespace paddle { namespace platform { -struct complex128; -struct complex64; +template +struct complex; } // namespace platform } // namespace paddle @@ -135,9 +135,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, + paddle::platform::complex>, ops::ElementwiseAddKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_add_grad, ops::ElementwiseAddGradKernel, @@ -145,9 +145,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, + paddle::platform::complex>, ops::ElementwiseAddGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_add_grad_grad, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseAddDoubleGradKernel); + paddle::platform::complex>); // A specialization elementwise_add operator, used in gradient accumulation with // inplace addto. 
@@ -178,9 +178,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, + paddle::platform::complex>, ops::ElementwiseAddKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_add) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 5c444e752e797571e525f9f4b0319146988c7683..aff0cb281642ecf9d9ee62890474ac87841c5e9a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -29,25 +28,28 @@ namespace operators { 1. For Unary Op, the length of input array is 1, e.g. Relu: return args[0] > 0 ? args[0] : 0; 2. For Binary Op, the length of input array is 2, - e.g. Add: return args[0] + args[1]; + e.g. Add: return args[0] expr args[1]; */ template struct CudaAddFunctor { - __device__ __forceinline__ T operator()(const T* args) const { + inline HOSTDEVICE T operator()(const T* args) const { return args[0] + args[1]; } }; template -struct SameDimsElemwiseAdd { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - std::vector ins = {x, y}; - std::vector outs = {z}; - LaunchElementwiseCudaKernel( - ctx.template device_context(), ins, &outs, - CudaAddFunctor()); +class ElementwiseAddKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaAddFunctor()); } }; @@ -132,8 +134,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel>, + ops::ElementwiseAddKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_add_grad, ops::ElementwiseAddGradKernel, @@ -141,8 +143,10 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel>, + ops::ElementwiseAddGradKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_add_grad_grad, ops::ElementwiseAddDoubleGradKernel, @@ -151,9 +155,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, + plat::complex>, ops::ElementwiseAddDoubleGradKernel); + plat::complex>); REGISTER_OP_CUDA_KERNEL( grad_add, ops::ElementwiseAddKernel, @@ -161,5 +165,5 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - 
ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel>, + ops::ElementwiseAddKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index abea9da9423553e177581a30c02fe73dc50369c6..a469ebbaec2edc9fadf0992412ef7d3b23d483e6 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -20,11 +20,13 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef __NVCC__ #include #include #include "cub/cub.cuh" + #endif #ifdef __HIPCC__ #include @@ -38,9 +40,10 @@ namespace paddle { namespace operators { template -void default_elementwise_add(const framework::ExecutionContext &ctx, - const framework::Tensor *x, - const framework::Tensor *y, framework::Tensor *z) { +void LaunchBroadcastElementwiseCpuKernel(const framework::ExecutionContext &ctx, + const framework::Tensor *x, + const framework::Tensor *y, + framework::Tensor *z) { int axis = ctx.Attr("axis"); auto x_dims = x->dims(); auto y_dims = y->dims(); @@ -68,12 +71,11 @@ class ElementwiseAddKernel : public framework::OpKernel { auto *y = ctx.Input("Y"); auto *z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - auto dims_equal = x->dims() == y->dims(); - if (dims_equal) { - SameDimsElemwiseAdd same_dims_add; - same_dims_add(ctx, x, y, z); + if (x->dims() == y->dims()) { + SameDimsElemwiseAdd LaunchElementwiseCpuKernel; + LaunchElementwiseCpuKernel(ctx, x, y, z); } else { - default_elementwise_add(ctx, x, y, z); + LaunchBroadcastElementwiseCpuKernel(ctx, x, y, z); } } }; @@ -459,8 +461,8 @@ class ElementwiseAddDoubleGradKernel : public framework::OpKernel { GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe); ddout->mutable_data(ctx.GetPlace()); - default_elementwise_add(ctx, &ddx_safe, &ddy_safe, - ddout); + LaunchBroadcastElementwiseCpuKernel(ctx, &ddx_safe, + &ddy_safe, ddout); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index 3768748931ded2a2541484bef2c8c37e72adda13..72d7e318d7b0526750ba0153c57e054247624f13 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -32,7 +32,7 @@ class ElementwiseAddNPUKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("Add", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -82,8 +82,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { } reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); reduced_dout.mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); runner.Run(stream); tmp_dout = &reduced_dout; } @@ -96,8 +97,8 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { } } if (axes.size() != 0) { - auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, - {{"axes", axes}, {"keep_dims", true}}); + const auto& runner = 
NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, + {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); } else { framework::TensorCopy( @@ -123,8 +124,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { } reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); reduced_dout.mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); runner.Run(stream); tmp_dout = &reduced_dout; } @@ -138,8 +140,8 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { } if (axes.size() != 0) { dy->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy}, - {{"axes", axes}, {"keep_dims", true}}); + const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy}, + {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); } else { framework::TensorCopy( diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc index 8d99aa2798568f507fceaf33772e85a81fd23b67..2e902bd277b1e4d016d0c3190579c409c8d361f3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_xpu.cc @@ -122,44 +122,65 @@ class ElementwiseAddGradXPUKernel : public ElemwiseGradKernel { axis)); std::vector x_dims_vec(max_dim, 1); std::vector y_dims_vec(max_dim, 1); + int x_len = 1; + int y_len = 1; if (x_dims.size() == max_dim) { for (int i = 0; i < max_dim; i++) { x_dims_vec[i] = x_dims[i]; + x_len *= x_dims_vec[i]; } } else { for (int i = 0; i < x_dims.size(); i++) { x_dims_vec[i + axis] = x_dims[i]; + x_len *= x_dims_vec[i]; } } if (y_dims.size() == max_dim) { for (int i = 0; i < max_dim; i++) { y_dims_vec[i] = y_dims[i]; + y_len *= y_dims_vec[i]; } } else { for (int i = 0; i < y_dims.size(); i++) { y_dims_vec[i + axis] = y_dims[i]; + y_len *= y_dims_vec[i]; } } + const T* dz_data = dz->data(); + framework::Tensor dx_local_tensor; + framework::Tensor dy_local_tensor; + bool need_wait = false; T* dx_data = nullptr; T* dy_data = nullptr; if (dx) { dx_data = dx->mutable_data(ctx.GetPlace()); + } else { + dx_data = + dx_local_tensor.mutable_data(ctx.GetPlace(), x_len * sizeof(T)); + need_wait = true; } if (dy) { dy_data = dy->mutable_data(ctx.GetPlace()); + } else { + dy_data = + dy_local_tensor.mutable_data(ctx.GetPlace(), y_len * sizeof(T)); + need_wait = true; } auto& dev_ctx = ctx.template device_context(); - int ret = xpu::broadcast_add_grad(dev_ctx.x_context(), dx_data, dx_data, - dx_data, dz->data(), dy_data, - dx_data, x_dims_vec, y_dims_vec); + int ret = xpu::broadcast_add_grad(dev_ctx.x_context(), dz_data, dz_data, + dz_data, dz_data, dy_data, dx_data, + x_dims_vec, y_dims_vec); PADDLE_ENFORCE_EQ( ret, xpu::SUCCESS, platform::errors::External( "XPU kernel Elementwise occur error in XPUElementwise error code ", ret, XPUAPIErrorMsg[ret])); + if (need_wait && dev_ctx.x_context()->xpu_stream) { + dev_ctx.Wait(); + } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 0252e6dfff5d755cdc9ded56df4dc77f1c542fc0..9a899ec11b4c17cadd836c5959ca7e4287e2dbd2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -17,8 +17,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -135,9 +134,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, + paddle::platform::complex>, ops::ElementwiseDivKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, @@ -145,9 +144,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, + paddle::platform::complex>, ops::ElementwiseDivGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_div_grad_grad, @@ -160,9 +159,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseDivDoubleGradKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_div) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 0cf9294c9de67fe4e7f2f32ff96c53586c8e860b..8853fd609f77c968c9b1758e951e6f9ba39aa10a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_div_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -24,38 +22,37 @@ namespace plat = paddle::platform; namespace paddle { namespace operators { +template +struct CudaDivFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return args[0] / args[1]; + } +}; + template -struct SameDimsElemwiseDiv { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - DivRangeFunctor functor(x->data(), y->data(), z->data()); - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, - x->numel()); - for_range(functor); +struct CudaDivFunctor::value>> { + inline HOSTDEVICE T operator()(const T* args) const { + PADDLE_ENFORCE(args[1] != 0, + "Invalid Argument Error: Integer division by zero " + "encountered in divide. 
Please check the input value."); + return args[0] / args[1]; } }; -template <> -struct SameDimsElemwiseDiv { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - auto size = x->numel(); - dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) / - PADDLE_CUDA_THREAD_SIZE, - 1); - dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); - const half* x2 = - reinterpret_cast(x->data()); - const half* y2 = - reinterpret_cast(y->data()); - half* z2 = reinterpret_cast(z->data()); - SameDimsElemwiseDivCUDAKernel<<< - grid_size, block_size, 0, - ctx.template device_context().stream()>>>( - x2, y2, z2, size); +template +class ElementwiseDivKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaDivFunctor()); } }; @@ -76,18 +73,21 @@ static __global__ void SimpleElemwiseDivGradCUDAKernel(const T* x, const T* y, } template <> -__global__ void SimpleElemwiseDivGradCUDAKernel( - const paddle::platform::complex64* x, const paddle::platform::complex64* y, - const paddle::platform::complex64* out, - const paddle::platform::complex64* dout, int64_t size, - paddle::platform::complex64* dx, paddle::platform::complex64* dy) { +__global__ void +SimpleElemwiseDivGradCUDAKernel>( + const paddle::platform::complex* x, + const paddle::platform::complex* y, + const paddle::platform::complex* out, + const paddle::platform::complex* dout, int64_t size, + paddle::platform::complex* dx, + paddle::platform::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - paddle::platform::complex64 o = dout[col]; - paddle::platform::complex64 y_conj(y[col].real, -y[col].imag); - paddle::platform::complex64 out_div_y_conj((out[col] / y[col]).real, - -(out[col] / y[col]).imag); + paddle::platform::complex o = dout[col]; + paddle::platform::complex y_conj(y[col].real, -y[col].imag); + paddle::platform::complex out_div_y_conj((out[col] / y[col]).real, + -(out[col] / y[col]).imag); dx[col] = o / y_conj; dy[col] = -o * out_div_y_conj; col += blockDim.x * gridDim.x; @@ -95,19 +95,21 @@ __global__ void SimpleElemwiseDivGradCUDAKernel( } template <> -__global__ void SimpleElemwiseDivGradCUDAKernel( - const paddle::platform::complex128* x, - const paddle::platform::complex128* y, - const paddle::platform::complex128* out, - const paddle::platform::complex128* dout, int64_t size, - paddle::platform::complex128* dx, paddle::platform::complex128* dy) { +__global__ void +SimpleElemwiseDivGradCUDAKernel>( + const paddle::platform::complex* x, + const paddle::platform::complex* y, + const paddle::platform::complex* out, + const paddle::platform::complex* dout, int64_t size, + paddle::platform::complex* dx, + paddle::platform::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - paddle::platform::complex128 o = dout[col]; - paddle::platform::complex128 y_conj(y[col].real, -y[col].imag); - paddle::platform::complex128 out_div_y_conj((out[col] / y[col]).real, - -(out[col] / y[col]).imag); + paddle::platform::complex o = dout[col]; + paddle::platform::complex y_conj(y[col].real, -y[col].imag); + paddle::platform::complex out_div_y_conj((out[col] / y[col]).real, + -(out[col] / y[col]).imag); dx[col] = o / 
y_conj; dy[col] = -o * out_div_y_conj; col += blockDim.x * gridDim.x; @@ -145,9 +147,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, ops::ElementwiseDivKernel, + paddle::platform::complex>, ops::ElementwiseDivKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, ops::ElementwiseDivGradKernel, @@ -157,9 +159,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, ops::ElementwiseDivGradKernel, + paddle::platform::complex>, ops::ElementwiseDivGradKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_div_grad_grad, ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseDivDoubleGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 0be8d934b17af7e367eefa2e4c5319f8cb1974f4..a0b9633acb2e5956754d07c53bcdcea7b2896c07 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -74,23 +74,13 @@ struct DivGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } }; -template <> -struct DivGradDX { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 y_conj(y.real, -y.imag); - return dout / y_conj; - } -}; - -template <> -struct DivGradDX { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 y_conj(y.real, -y.imag); +template +struct DivGradDX> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex y_conj(y.real, -y.imag); return dout / y_conj; } }; @@ -102,23 +92,13 @@ struct DivGradDY { } }; -template <> -struct DivGradDY { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 out_div_y_conj((out / y).real, -(out / y).imag); - return -dout * out_div_y_conj; - } -}; - -template <> -struct DivGradDY { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 out_div_y_conj((out / y).real, +template +struct DivGradDY> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex out_div_y_conj((out / y).real, -(out / y).imag); return -dout * out_div_y_conj; } diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc index 8852f3a419adc51d311178175fd6f71a8c628540..4f3da27f4a67379624f5b5a66840bbc0cbac4f17 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc +++ 
b/paddle/fluid/operators/elementwise/elementwise_div_op_npu.cc @@ -40,7 +40,7 @@ class ElementwiseDivNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Div", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Div", {*x, *y}, {*out}, {}); runner.Run(stream); } }; @@ -65,46 +65,47 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel { Tensor y_power(y->type()); y_power.mutable_data(y->dims(), place); - auto y_power_runner = NpuOpRunner("Power", {*y}, {y_power}, - {{"power", static_cast(-1)}}); - y_power_runner.Run(stream); + const auto& runner_y_power = NpuOpRunner( + "Power", {*y}, {y_power}, {{"power", static_cast(-1)}}); + runner_y_power.Run(stream); if (dx) { dx->mutable_data(place); Tensor tensor_zeros(x->type()); tensor_zeros.mutable_data(x->dims(), place); - auto tensor_zeros_runner = + const auto& runner_tensor_zeros = NpuOpRunner("ZerosLike", {*x}, {tensor_zeros}, {}); - tensor_zeros_runner.Run(stream); + runner_tensor_zeros.Run(stream); Tensor x_zero(paddle::framework::proto::VarType::BOOL); x_zero.mutable_data(x->dims(), place); - auto x_zero_runner = + const auto& runner_x_zero = NpuOpRunner("Equal", {*x, tensor_zeros}, {x_zero}, {}); - x_zero_runner.Run(stream); + runner_x_zero.Run(stream); Tensor x_nozero(paddle::framework::proto::VarType::BOOL); x_nozero.mutable_data(x->dims(), place); - auto x_nozero_runner = + const auto& runner_x_nonzero = NpuOpRunner("LogicalNot", {x_zero}, {x_nozero}, {}); - x_nozero_runner.Run(stream); + runner_x_nonzero.Run(stream); Tensor x_nozero_f(x->type()); x_nozero_f.mutable_data(x->dims(), place); - auto x_nozero_f_runner = + const auto& runner_x_nonzero_f = NpuOpRunner("Cast", {x_nozero}, {x_nozero_f}, {{"dst_type", static_cast(0)}}); - x_nozero_f_runner.Run(stream); + runner_x_nonzero_f.Run(stream); Tensor x_grad_w(x->type()); x_grad_w.mutable_data(x->dims(), place); - auto x_grad_w_runner = + const auto& runner_x_grad_w = NpuOpRunner("Mul", {x_nozero_f, y_power}, {x_grad_w}, {}); - x_grad_w_runner.Run(stream); + runner_x_grad_w.Run(stream); - auto x_grad_runner = NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {}); - x_grad_runner.Run(stream); + const auto& runner_x_grad = + NpuOpRunner("Mul", {x_grad_w, *dout}, {*dx}, {}); + runner_x_grad.Run(stream); } if (dy) { @@ -112,16 +113,18 @@ class ElementwiseDivGradNPUKernel : public framework::OpKernel { Tensor neg_out(y->type()); neg_out.mutable_data(y->dims(), place); - auto neg_out_runner = NpuOpRunner("Neg", {*out}, {neg_out}, {}); - neg_out_runner.Run(stream); + const auto& runner_neg_out = NpuOpRunner("Neg", {*out}, {neg_out}, {}); + runner_neg_out.Run(stream); Tensor y_grad_w(y->type()); y_grad_w.mutable_data(y->dims(), place); - auto y_grad_w_runner = NpuOpRunner("Div", {neg_out, *y}, {y_grad_w}, {}); - y_grad_w_runner.Run(stream); + const auto& runner_y_grad_w = + NpuOpRunner("Div", {neg_out, *y}, {y_grad_w}, {}); + runner_y_grad_w.Run(stream); - auto y_grad_runner = NpuOpRunner("Mul", {y_grad_w, *dout}, {*dy}, {}); - y_grad_runner.Run(stream); + const auto& runner_y_grad = + NpuOpRunner("Mul", {y_grad_w, *dout}, {*dy}, {}); + runner_y_grad.Run(stream); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu index 60846d1e8fee1c7f68ac101f18355750c2c15a4d..a0510d95700b27ba360c48f06ac3f99752b993f2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu +++ 
b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu @@ -12,11 +12,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_floordiv_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" namespace ops = paddle::operators; namespace plat = paddle::platform; +namespace paddle { +namespace operators { + +template +struct CudaFloorDivFunctor { + inline HOSTDEVICE T operator()(const T argv[]) const { + PADDLE_ENFORCE(argv[1] != 0, + "InvalidArgument: divide by zero " + "encountered in floor-divide ops, please check.\n"); + return static_cast(std::trunc(argv[0] / argv[1])); + } +}; + +template +class ElementwiseFloorDivKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaFloorDivFunctor()); + } +}; + +} // namespace operators +} // namespace paddle + REGISTER_OP_CUDA_KERNEL( elementwise_floordiv, ops::ElementwiseFloorDivKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h index 06eb0b1cc851082447ba2cdc1ffbbc895eb4cf70..bc3c2994c847cb65fb6b476c2bbf8076edfffc1d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.h @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc index da0116114747fa2e44045b75f3bd9bd0dc73d980..d97c04f10c497870cedbd7c42616ddf6c3431311 100644 --- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op_npu.cc @@ -37,7 +37,7 @@ class ElementwiseFloorDivNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("FloorDiv", {*x, *y}, {*out}, {}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index 5d086a1b29febd8e57507eced7683f414ca34e07..d4b5d98d5b0b345119f833e5a684d8f0b6e1f310 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -12,9 +12,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_max_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" namespace ops = paddle::operators; +namespace paddle { +namespace operators { + +template +struct CudaMaxFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return (args[0] > args[1] ? args[0] : args[1]); + } +}; + +template +class ElementwiseMaxKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaMaxFunctor()); + } +}; + +} // namespace operators +} // namespace paddle + REGISTER_OP_CUDA_KERNEL( elementwise_max, ops::ElementwiseMaxKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc index 3cdb6420e8ee1d159ecd525ab6a2360544ca5323..a616d0bc9d156453c5ce09403fb4dbc27dc133e9 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op_npu.cc @@ -40,7 +40,7 @@ class ElementwiseMaxNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*out}, {}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cu b/paddle/fluid/operators/elementwise/elementwise_min_op.cu index cf93e5a97a3f3110aae907c593f58dbab0f9d090..4a99f7e36705f0d96b200d20e880bebf5b5b2186 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu @@ -12,9 +12,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_min_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" namespace ops = paddle::operators; +namespace paddle { +namespace operators { + +template +struct CudaMinFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return (args[0] > args[1] ? 
args[1] : args[0]); + } +}; + +template +class ElementwiseMinKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaMinFunctor()); + } +}; + +} // namespace operators +} // namespace paddle + REGISTER_OP_CUDA_KERNEL( elementwise_min, ops::ElementwiseMinKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc index 987c250d651475d44da7e2ebf88222b74e5b5af0..48ac3905f32bd90c8d495d7bae37b0a5cc2c15f0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc @@ -40,7 +40,7 @@ class ElementwiseMinNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Minimum", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*out}, {}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu index 92991ab3a0a24c0969a403c2e2e2d1b1cb950d2f..bb49fdbf12dfa36ae2127eccc1c189939bda9a2e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu @@ -12,13 +12,60 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_mod_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; namespace plat = paddle::platform; +namespace paddle { +namespace operators { + +template +struct CudaModFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + T res = args[0] % args[1]; + + // According to #PR26732: in dividend % divisor, the + // remainder shall have the same sign as the divisor. + if ((res != 0) && ((args[1] ^ res) < 0)) res += args[1]; + return res; + } +}; + +template +struct CudaModFunctor< + T, typename std::enable_if_t::value>> { + inline HOSTDEVICE T operator()(const T* args) const { + T res = fmod(args[0], args[1]); + + // According to #PR26732: in dividend % divisor, the + // remainder shall have the same sign as the divisor.
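// Hedged worked example of this sign rule (operand values assumed, not
// taken from the patch itself):
//   fmod(-7.0, 3.0) == -1.0; the remainder and divisor have opposite signs,
//   so res += 3.0 and the functor returns 2.0;
//   fmod(7.0, -3.0) == 1.0; the signs differ again, so res += -3.0 -> -2.0.
// This matches a floor-style modulus (Python's %) rather than the
// truncation-style result of plain % / fmod.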
+ if ((res != 0) && ((res < 0) != (args[1] < 0))) res += args[1]; + return res; + } +}; + +template +class ElementwiseModKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaModFunctor()); + } +}; + +} // namespace operators +} // namespace paddle + REGISTER_OP_CUDA_KERNEL( elementwise_mod, ops::ElementwiseModKernel, ops::ElementwiseModKernel, - ops::ElementwiseModFPKernel, - ops::ElementwiseModFPKernel); + ops::ElementwiseModKernel, + ops::ElementwiseModKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h index 87e940e2ed6319c4f2957cd846735adb210cd23d..03884f2a45883bbb55bf2b2655636bb003084147 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 6bf296f0e0b57aaab6e16083a35eab5ec80613ef..0045f00ecc6c25ca700cb8bbdca510fc7f705b8e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -16,8 +16,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -134,9 +133,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, + paddle::platform::complex>, ops::ElementwiseMulKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad, ops::ElementwiseMulGradKernel, @@ -144,9 +143,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, + paddle::platform::complex>, ops::ElementwiseMulGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad_grad, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseMulDoubleGradKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_mul) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index e01b5eb5fb73d9aca7de318276014f29576040a9..adcc18f837e670ff54459be8f47c97977269a439 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -25,37 +25,26 @@ namespace paddle { namespace operators { template -struct SameDimsElemwiseMul { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - MulRangeFunctor functor(x->data(), y->data(), z->data()); - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, - x->numel()); - for_range(functor); +struct CudaMulFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return args[0] * args[1]; } }; -template <> -struct SameDimsElemwiseMul { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - auto size = x->numel(); - dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) / - PADDLE_CUDA_THREAD_SIZE, - 1); - dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); - const half* x2 = - reinterpret_cast(x->data()); - const half* y2 = - reinterpret_cast(y->data()); - half* z2 = reinterpret_cast(z->data()); - SameDimsElemwiseMulCUDAKernel<<< - grid_size, block_size, 0, - ctx.template device_context().stream()>>>( - x2, y2, z2, size); +template +class ElementwiseMulKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + framework::Tensor x_for_selectedrows; + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs, &x_for_selectedrows); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaMulFunctor()); } }; @@ -76,31 +65,31 @@ static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y, } template <> -__global__ void SimpleElemwiseMulGradCUDAKernel( - const plat::complex64* x, const plat::complex64* y, - const plat::complex64* out, const plat::complex64* dout, int64_t size, - plat::complex64* dx, plat::complex64* dy) { +__global__ void SimpleElemwiseMulGradCUDAKernel>( + const plat::complex* x, const plat::complex* y, + const plat::complex* out, const plat::complex* dout, + int64_t size, plat::complex* dx, plat::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - plat::complex64 o = dout[col]; - dx[col] = plat::complex64(y[col].real, -y[col].imag) * o; - dy[col] = plat::complex64(x[col].real, -x[col].imag) * o; + plat::complex o = dout[col]; + dx[col] = plat::complex(y[col].real, -y[col].imag) * o; + dy[col] = plat::complex(x[col].real, -x[col].imag) * o; col += blockDim.x * gridDim.x; } } template <> -__global__ void SimpleElemwiseMulGradCUDAKernel( - const plat::complex128* x, const plat::complex128* y, - const plat::complex128* out, const plat::complex128* dout, int64_t size, - plat::complex128* dx, plat::complex128* dy) { +__global__ void SimpleElemwiseMulGradCUDAKernel>( + const plat::complex* x, const plat::complex* y, + const plat::complex* out, const plat::complex* dout, + int64_t size, plat::complex* dx, plat::complex* dy) { int col = blockIdx.x * blockDim.x + threadIdx.x; while (col < size) { - 
plat::complex128 o = dout[col]; - dx[col] = plat::complex128(y[col].real, -y[col].imag) * o; - dy[col] = plat::complex128(x[col].real, -x[col].imag) * o; + plat::complex o = dout[col]; + dx[col] = plat::complex(y[col].real, -y[col].imag) * o; + dy[col] = plat::complex(x[col].real, -x[col].imag) * o; col += blockDim.x * gridDim.x; } } @@ -133,8 +122,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + ops::ElementwiseMulKernel>, + ops::ElementwiseMulKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, ops::ElementwiseMulGradKernel, @@ -142,8 +131,10 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel>, + ops::ElementwiseMulGradKernel>); REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad_grad, ops::ElementwiseMulDoubleGradKernel, @@ -152,6 +143,6 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, + plat::complex>, ops::ElementwiseMulDoubleGradKernel); + plat::complex>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 46a00268e4134a1a797954a6d61cfcf0d88f9b79..a734f891a9d9e83592156442e48215a93af3a920 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -126,29 +126,18 @@ class ElementwiseMulKernel : public framework::OpKernel { } } }; - template struct MulGradDX { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } }; -template <> -struct MulGradDX { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 y_conj(y.real, -y.imag); - return dout * y_conj; - } -}; - -template <> -struct MulGradDX { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 y_conj(y.real, -y.imag); +template +struct MulGradDX> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex y_conj(y.real, -y.imag); return dout * y_conj; } }; @@ -158,23 +147,13 @@ struct MulGradDY { HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } }; -template <> -struct MulGradDY { - HOSTDEVICE paddle::platform::complex64 operator()( - paddle::platform::complex64 x, paddle::platform::complex64 y, - paddle::platform::complex64 out, paddle::platform::complex64 dout) const { - paddle::platform::complex64 x_conj(x.real, -x.imag); - return dout * x_conj; - } -}; - -template <> -struct MulGradDY { - HOSTDEVICE paddle::platform::complex128 operator()( - paddle::platform::complex128 x, paddle::platform::complex128 y, - paddle::platform::complex128 out, - paddle::platform::complex128 dout) const { - paddle::platform::complex128 x_conj(x.real, -x.imag); +template +struct MulGradDY> { + HOSTDEVICE paddle::platform::complex operator()( + paddle::platform::complex x, 
paddle::platform::complex y, + paddle::platform::complex out, + paddle::platform::complex dout) const { + paddle::platform::complex x_conj(x.real, -x.imag); return dout * x_conj; } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc index 08df6d4e27af0a79123f26ad2064ee0203cc1b28..47aa7e2521f76abe0bbbdf4c9adc4f02b43434ff 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc @@ -41,7 +41,7 @@ class ElementwiseMulNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); runner.Run(stream); } }; @@ -65,14 +65,14 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { if (dx) { dx->mutable_data(place); - auto dx_runner = NpuOpRunner("Mul", {*dout, *y}, {*dx}, {}); - dx_runner.Run(stream); + const auto& runner_dx = NpuOpRunner("Mul", {*dout, *y}, {*dx}, {}); + runner_dx.Run(stream); } if (dy) { dy->mutable_data(place); - auto dy_runner = NpuOpRunner("Mul", {*x, *dout}, {*dy}, {}); - dy_runner.Run(stream); + const auto& runner_dy = NpuOpRunner("Mul", {*x, *dout}, {*dy}, {}); + runner_dy.Run(stream); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..541ff9aacfc46247e1dee1b6fa6b1c523a9c470b --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -0,0 +1,533 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#pragma once + +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" + +namespace paddle { +namespace operators { + +struct DimensionsTransform { + using DimVector = std::vector; + typedef void (*MergeFunctor)(bool &, std::vector &, DimVector &, + int, int); + int64_t dim_size; + DimVector out_dims; + std::vector in_dims; + + private: + // To compensate the lackage of input_tensors` dimension with input variable + // 'axis' + void InputDimensionsExtend(int N, int axis) { + for (auto &in_dim : in_dims) { + int64_t in_idx = 0; + if (in_dim.size() < dim_size) { + DimVector tmp_dim(dim_size, 1); + do { + if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) { + tmp_dim[axis] = in_dim[in_idx]; + in_idx++; + axis++; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The %dth dimension of input tensor is expected to be equal " + "with" + "the %dth dimension of output tensor %d or 1, but recieved " + "%d.\n", + in_idx + 1, axis + 1, out_dims[axis], in_dim[in_idx])); + } + } while (in_idx < in_dim.size()); + in_dim.resize(dim_size); + std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin()); + } else { + do { + if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) { + in_idx++; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The %dth dimension of input tensor is expected to be equal " + "with" + "the %dth dimension of output tensor %d or 1, but recieved " + "%d.\n", + in_idx + 1, in_idx + 1, out_dims[in_idx], in_dim[in_idx])); + } + } while (in_idx < dim_size); + } + std::reverse(in_dim.begin(), in_dim.end()); + } + std::reverse(out_dims.begin(), out_dims.end()); + } + + template + __inline__ void MergeDimensions(MergeFunctor merge_func, int N) { + auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) { + (*vec)[m_idx - 1] = + std::accumulate(vec->begin() + l_idx, vec->begin() + m_idx, 1, + std::multiplies()); + vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1); + }; + + int64_t i = 0; + while (i < dim_size) { + int cnt = 0; + int low_idx = i; + bool equal = true; + do { + merge_func(equal, in_dims, out_dims, i, N); + if (equal) { + i++; + cnt++; + } else { + break; + } + } while (i < dim_size); + + if (cnt > 1) { + for (auto &in_dim : in_dims) { + VectorReorganise(&in_dim, low_idx, i); + } + VectorReorganise(&out_dims, low_idx, i); + dim_size -= --cnt; + i -= cnt; + } else if (cnt < 1) { + i++; + } + } + } + + public: + explicit DimensionsTransform( + const std::vector &ins, + const framework::DDim &dims, int axis) { + const int N = ins.size(); + dim_size = dims.size(); + out_dims = framework::vectorize(dims); + in_dims.resize(N); + for (int j = 0; j < N; ++j) { + in_dims[j] = framework::vectorize(ins[j]->dims()); + } + InputDimensionsExtend(N, axis); + + auto merge_sequential_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, int i, int num) { + for (int j = 1; j < num; ++j) { + equal = (in_dims[0][i] == in_dims[j][i]) ? true : false; + } + }; + auto merge_sequential_one_dims = [](bool &equal, + std::vector &in_dims, + DimVector &out, int i, int num) { + equal = in_dims[0][i] == 1; + if (equal) { + for (int j = 1; j < num; ++j) { + equal = in_dims[j][i] == out[i]; + } + } + }; + // To Merge the dimensions of input_tensors while the consequtive + // equal-dimensions appears. 
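// Hedged worked example (shapes assumed): broadcasting x = [2, 3, 4, 5]
// against y = [2, 3, 1, 1] with out = [2, 3, 4, 5], the equal-dimension
// pass folds the run {2, 3} into 6, and the 1-value pass then folds
// {4, 5} of x against {1, 1} of y, leaving x = [6, 20], y = [6, 1],
// out = [6, 20]; strides and divmoders are then only needed for the two
// merged axes.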
+ MergeFunctor merge_ptr = merge_sequential_dims; + MergeDimensions(merge_ptr, N); + + int min_idx = 0; + int min_val = std::accumulate(in_dims[0].begin(), in_dims[0].end(), 1, + std::multiplies()); + for (int j = 1; j < N; ++j) { + int temp = std::accumulate(in_dims[j].begin(), in_dims[j].end(), 1, + std::multiplies()); + min_val = min_val > temp ? temp : min_val; + min_idx = min_val == temp ? j : min_idx; + } + std::swap(in_dims[0], in_dims[min_idx]); + + // To Merge the dimension of input_tensors while the consequtive + // 1-value-dimensions appears. + merge_ptr = merge_sequential_one_dims; + MergeDimensions(merge_ptr, N); + std::swap(in_dims[min_idx], in_dims[0]); + } +}; + +struct StridesCalculation { + std::vector> strides; + std::vector divmoders; + + private: + // To calculate the strides of each input_tensor. + __inline__ void CalculateStrides( + int N, int dim_size, const std::vector> &in_dims) { + for (int j = 0; j < N; ++j) { + for (int i = 0; i < dim_size; ++i) { + strides[j][i] = in_dims[j][i] == 1 ? 0 : strides[j][i]; + strides[j][i] = + (i != 0 && strides[j][i] != 0) + ? std::accumulate(in_dims[j].begin(), in_dims[j].begin() + i, 1, + std::multiplies()) + : strides[j][i]; + } + } + } + + public: + explicit StridesCalculation(const int64_t &dim_size, + const std::vector> &in_dims, + const std::vector &out_dims) { + const auto N = in_dims.size(); + divmoders.resize(dim_size); + strides.resize(N, std::vector(dim_size, 1)); + + for (int i = 0; i < dim_size; ++i) { + divmoders[i] = FastDivMod(out_dims[i]); + } + CalculateStrides(N, dim_size, in_dims); + } +}; + +template +struct BroadcastArgsWarpper { + using InVecType = CudaAlignedVector; + using OutVecType = CudaAlignedVector; + + OutT *out_data; + OutVecType *vec_out_data; + const InT *__restrict__ in_data[ET]; + const InVecType *__restrict__ vec_in_data[ET]; + bool no_broadcast[ET]; + FastDivMod divmoders[kDims]; + uint32_t strides[ET][framework::DDim::kMaxRank]; + uint32_t scalar_cal_offset; + Functor func; + + HOSTDEVICE BroadcastArgsWarpper( + const std::vector &ins, framework::Tensor *out, + int scalar_cal_offset, Functor func, + const StridesCalculation &offset_calculator) + : scalar_cal_offset(scalar_cal_offset), func(func) { + for (int j = 0; j < ET; ++j) { + in_data[j] = ins[j]->data(); + vec_in_data[j] = reinterpret_cast(in_data[j]); + no_broadcast[j] = ins[j]->dims() == out->dims() ? 
true : false; + memcpy(strides[j], offset_calculator.strides[j].data(), + kDims * sizeof(uint32_t)); + } + out_data = out->data(); + vec_out_data = reinterpret_cast(out_data); + memcpy(divmoders, offset_calculator.divmoders.data(), + kDims * sizeof(FastDivMod)); + } + + __device__ __forceinline__ uint32_t GetOffsetByDivmod(int idx, int in_idx) { + uint32_t offset = 0; + +#pragma unroll(kDims) + for (int i = 0; i < kDims; ++i) { + auto fast_divmoder = divmoders[i].Divmod(idx); + idx = fast_divmoder.val[0]; + offset += fast_divmoder.val[1] * strides[in_idx][i]; + } + return offset; + } + + __device__ __forceinline__ void LoadVectorizedDataCommon( + InVecType *vector_args, int tid, int idx) { + *vector_args = vec_in_data[idx][tid]; + } + + __device__ __forceinline__ void LoadVectorizedDataByDivmod(InT *scalar_args, + int tid, int idx) { + int index = tid * VecSize; +#pragma unroll(VecSize) + for (int i = 0; i < VecSize; ++i) { + uint32_t offset = GetOffsetByDivmod(index + i, idx); + scalar_args[i] = in_data[idx][offset]; + } + } + + __device__ __forceinline__ void LoadScalarizedDataCommon(InT args[], int tid, + int idx) { + args[idx] = in_data[idx][tid + scalar_cal_offset]; + } + + __device__ __forceinline__ void LoadScalarizedDataByDivmod(InT args[], + int tid, int idx) { + auto offset = GetOffsetByDivmod(tid + scalar_cal_offset, idx); + args[idx] = in_data[idx][offset]; + } + + __device__ __forceinline__ void LoadVectorizedData(InT (*args)[VecSize], + int tid) { +#pragma unroll(ET) + for (int j = 0; j < ET; ++j) { + if (no_broadcast[j]) { + InVecType *vector_args = reinterpret_cast(args[j]); + LoadVectorizedDataCommon(vector_args, tid, j); + } else { + LoadVectorizedDataByDivmod(args[j], tid, j); + } + } + } + + __device__ __forceinline__ void LoadScalarizedData(InT args[], int tid) { +#pragma unroll(ET) + for (int j = 0; j < ET; ++j) { + if (no_broadcast[j]) { + LoadScalarizedDataCommon(args, tid, j); + } else { + LoadScalarizedDataByDivmod(args, tid, j); + } + } + } + + __device__ __forceinline__ void StoreVectorizedData(OutVecType vec_args_out, + int tid) { + vec_out_data[tid] = vec_args_out; + } + + __device__ __forceinline__ void StoreScalarizedData(OutT args_out, int tid) { + out_data[scalar_cal_offset + tid] = args_out; + } +}; + +template +__device__ inline void ScalarizedBroadcastKernelImpl( + BroadcastArgsWarpper broadcast_warpper, int tid) { + InT args[ET]; + OutT args_out; + broadcast_warpper.LoadScalarizedData(args, tid); + +#pragma unroll(ET) + for (int j = 1; j < ET; ++j) { + args_out = broadcast_warpper.func(args); + } + broadcast_warpper.StoreScalarizedData(args_out, tid); +} + +template +__device__ inline void VectorizedBroadcastKernelImpl( + BroadcastArgsWarpper broadcast_warpper, int tid) { + using OutVecType = CudaAlignedVector; + OutVecType args_out; + InT ins[ET]; + InT args[ET][VecSize]; + broadcast_warpper.LoadVectorizedData(args, tid); + +#pragma unroll(VecSize) + for (int i = 0; i < VecSize; ++i) { +#pragma unroll(ET) + for (int j = 0; j < ET; ++j) { + ins[j] = args[j][i]; + } + args_out.val[i] = broadcast_warpper.func(ins); + } + broadcast_warpper.StoreVectorizedData(args_out, tid); +} + +template +__global__ void ElementwiseBroadcastKernel( + BroadcastArgsWarpper broadcast_warpper, int main_tid, int tail_tid) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + // Vectorized calculation of major data whose length is the max multipler of + // VecSize, + // eg: Calcualting the front 1024-length data in total 1027 data once VecSize + // is 4. 
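// Hedged worked example of the split (numbers follow the comment above):
//   numel = 1027, VecSize = 4 gives
//   main_tid = 1027 / 4 = 256 -> threads doing 4-wide vectorized work,
//   tail_tid = 1027 % 4 = 3   -> threads handling one leftover scalar each,
//   scalar_cal_offset = 256 * 4 = 1024, so the tail covers elements
//   1024..1026.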
+ if (tid < main_tid) { + VectorizedBroadcastKernelImpl( + broadcast_warpper, tid); + } + // Scalarzed calculation of rest data whose lenght cannot fulfill VecSize. + // eg: Calcualting the rest 3-length data in total 1027 data once VecSize is + // 4. + if (tid < tail_tid) { + ScalarizedBroadcastKernelImpl( + broadcast_warpper, tid); + } +} + +template +void LaunchBroadcastKernelForDifferentDimSize( + const platform::CUDADeviceContext &ctx, + const std::vector &ins, framework::Tensor *out, + int axis, Functor func) { + int numel = out->numel(); + const int threads = 256; + int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; + int main_tid = numel / VecSize; + int tail_tid = numel % VecSize; + int vec_len = main_tid * VecSize; + auto stream = ctx.stream(); + + const auto merge_dims = DimensionsTransform(ins, out->dims(), axis); + const auto offset_calculator = StridesCalculation( + merge_dims.dim_size, merge_dims.in_dims, merge_dims.out_dims); + + switch (merge_dims.dim_size) { + case 1: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 2: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 3: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 4: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 5: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 6: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 7: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + case 8: { + auto broadcast_warpper = + BroadcastArgsWarpper( + ins, out, vec_len, func, offset_calculator); + ElementwiseBroadcastKernel<<>>( + broadcast_warpper, main_tid, tail_tid); + break; + } + default: { + PADDLE_THROW(platform::errors::InvalidArgument( + "The maximum dimension of input tensor is expected to be less than " + "%d, but recieved %d.\n", + merge_dims.dim_size, framework::DDim::kMaxRank)); + } + } +} + +template +void LaunchBroadcastElementwiseCudaKernel( + const platform::CUDADeviceContext &ctx, + const std::vector &ins, + std::vector *outs, int axis, Functor func) { + PADDLE_ENFORCE_EQ(ET, ElementwiseType::kBinary, + platform::errors::InvalidArgument( + "Currently, only Support binary calculation, " + "but received %d input tensors.\n", + static_cast(ET))); + int in_vec_size = 4; + framework::Tensor *out = (*outs)[0]; + for (auto *in : ins) { + auto temp_size = GetVectorizedSizeImpl(in->data()); + in_vec_size = in->dims() == out->dims() ? 
std::min(temp_size, in_vec_size) + : in_vec_size; + } + int out_vec_size = GetVectorizedSizeImpl(out->data()); + int vec_size = std::min(out_vec_size, in_vec_size); + + switch (vec_size) { + case 4: { + LaunchBroadcastKernelForDifferentDimSize(ctx, ins, out, + axis, func); + break; + } + case 2: { + LaunchBroadcastKernelForDifferentDimSize(ctx, ins, out, + axis, func); + break; + } + case 1: { + LaunchBroadcastKernelForDifferentDimSize(ctx, ins, out, + axis, func); + break; + } + default: { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +template +void LaunchElementwiseCudaKernel( + const platform::CUDADeviceContext &cuda_ctx, + const std::vector &ins, + std::vector *outs, int axis, Functor func) { + std::vector dims_size; + bool no_broadcast_flag = true; + for (auto *in : ins) { + no_broadcast_flag = ins[0]->dims() == in->dims(); + dims_size.emplace_back(in->dims().size()); + } + + if (no_broadcast_flag) { + LaunchSameDimsElementwiseCudaKernel(cuda_ctx, ins, outs, + func); + } else { + axis = axis == -1 + ? *std::max_element(dims_size.begin(), dims_size.end()) - + *std::min_element(dims_size.begin(), dims_size.end()) + : axis; + LaunchBroadcastElementwiseCudaKernel(cuda_ctx, ins, outs, + axis, func); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 32e49cf3996f120d2e2a8f909883e0c46f7b1352..cc291ae471386faceefeadb4d022c5538540df02 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -57,9 +57,78 @@ constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; *mod = dividend_copy % divisor; \ } while (0) +#define DIVUP(x, y) (((x) + (y)-1) / (y)) + +#define ROUNDUP(x, y) (DIVUP((x), (y)) * (y)) + namespace paddle { namespace operators { +/* +* Pack input and output tensors into respective vectors with +* consideration of varible X`s class type. +* Input variable X is supported to be whether LoDTensor or +* SelectedRows class type in this package function, once X +* was SelectedRows type, a valid pointer x_for_selectedrows +* is excepted to be passed in from op kernel for acquisition +* of the valid address of LoDTensor created ahead in the function. +*/ +template +int PackTensorsIntoVector(const framework::ExecutionContext &ctx, + std::vector *ins, + std::vector *outs, + framework::Tensor *x_for_selectedrows = nullptr) { + int axis = -1; + auto x_var = ctx.InputVar("X"); + PADDLE_ENFORCE_NOT_NULL( + x_var, platform::errors::InvalidArgument( + "Unable to get input Variable X, Variable name is %s.\n", + ctx.InputName("X"))); + auto *y = ctx.Input("Y"); + framework::Tensor *z; + + if (x_var->IsType()) { + auto *x = ctx.Input("X"); + z = ctx.Output("Out"); + ins->emplace_back(x); + } else if (x_var->IsType()) { + PADDLE_ENFORCE_EQ(y->dims().size() == 1 && y->dims()[0] == 1, true, + platform::errors::InvalidArgument( + "For elementwise_op, if X is Sparse, Y must be " + "scalar. 
But reveived the size of Y = %d.", + y->dims().size())); + PADDLE_ENFORCE_NOT_NULL( + x_for_selectedrows, + platform::errors::InvalidArgument( + "The parameter x_for_selectedrows is excepted to " + "be valid, once input varible X`s class type is " + "SelectedRows.\n")); + auto &x_sele = x_var->Get(); + auto out_sele = ctx.Output("Out"); + *x_for_selectedrows = x_sele.value(); + out_sele->set_rows(x_sele.rows()); + out_sele->set_height(x_sele.height()); + out_sele->mutable_value()->Resize(x_sele.value().dims()); + out_sele->mutable_value()->mutable_data(ctx.GetPlace(), + x_for_selectedrows->type()); + z = ctx.Output("Out")->mutable_value(); + ins->emplace_back(x_for_selectedrows); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "X's type[%s] is not supported by elementwise_op. X's type should be " + "LoDTensor or SelectedRows.", + framework::ToTypeName(x_var->Type()))); + } + z->mutable_data(ctx.GetPlace()); + outs->emplace_back(z); + + if (y != nullptr) { + ins->emplace_back(y); + axis = ctx.HasAttr("axis") ? ctx.Attr("axis") : -1; + } + return axis; +} + /* * Out = X ⊙ Y * If Y's shape does not match X' shape, they will be reshaped. @@ -187,6 +256,10 @@ void CommonForwardBroadcastCPU(const framework::Tensor *x, std::vector index_array(max_dim, 0); const T *x_data = x->data(); const T *y_data = y->data(); + PADDLE_ENFORCE_NOT_NULL(x_data, platform::errors::InvalidArgument( + "The input X should not be empty.")); + PADDLE_ENFORCE_NOT_NULL(y_data, platform::errors::InvalidArgument( + "The input Y should not be empty.")); OutType *out_data = z->mutable_data(ctx.GetPlace()); const int out_size = std::accumulate(out_dims_array, out_dims_array + max_dim, @@ -2087,10 +2160,10 @@ template <<>>( @@ -2516,106 +2589,129 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( const T *x, const T *y, const T *intermediate_out, const T *out, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) { - int j = blockIdx.x; - int i = threadIdx.x; - int tid = threadIdx.x; - T val(0), inter_val(0); - int64_t tmp_out_idx, x_idx, y_idx; + __shared__ T sdata[BLOCK_Y][BLOCK_X]; + size_t idx = threadIdx.x + BLOCK_X * blockIdx.x; + size_t width_stride = gridDim.x * BLOCK_X; + + size_t full_w = ROUNDUP(w, BLOCK_X); + T zero = static_cast(0); - do { - int offset = i * w + j; + for (size_t j = idx; j < full_w; j += width_stride) { + T val(0), inter_val(0); + if (j < w) { + for (size_t i = threadIdx.y; i < h; i += BLOCK_Y) { + size_t offset = i * w + j; - tmp_out_idx = BcastY ? j : offset; - y_idx = BcastY ? j : offset; - x_idx = BcastY ? offset : j; - T x_val = (x == nullptr) ? zero : x[x_idx]; - T y_val = (y == nullptr) ? zero : y[y_idx]; + size_t tmp_out_idx = BcastY ? j : offset; + size_t y_idx = BcastY ? j : offset; + size_t x_idx = BcastY ? offset : j; + T x_val = (x == nullptr) ? zero : x[x_idx]; + T y_val = (y == nullptr) ? zero : y[y_idx]; - if (SameShapeOfIntermediateOutAndOut) { - tmp_out_idx = offset; - } + if (SameShapeOfIntermediateOutAndOut) { + tmp_out_idx = offset; + } - if (dx != nullptr) { - T tmp = UseIntermediateOut + if (dx != nullptr) { + T tmp = + UseIntermediateOut ? 
dx_op.UseIntermediateOut(x_val, y_val, intermediate_out[tmp_out_idx], out[offset], dout[offset]) : dx_op.Recompute(x_val, y_val, out[offset], dout[offset]); - if (BcastY) { - dx[x_idx] = tmp; - } else { - val += tmp; - } - } - if (dy != nullptr) { - T tmp = UseIntermediateOut + if (BcastY) { + dx[x_idx] = tmp; + } else { + val += tmp; + } + } + if (dy != nullptr) { + T tmp = + UseIntermediateOut ? dy_op.UseIntermediateOut(x_val, y_val, intermediate_out[tmp_out_idx], out[offset], dout[offset]) : dy_op.Recompute(x_val, y_val, out[offset], dout[offset]); - if (BcastY) { - val += tmp; - } else { - dy[y_idx] = tmp; - } - } - if (d_intermediate != nullptr) { - T tmp = UseIntermediateOut - ? dintermediate_op.UseIntermediateOut( - y[y_idx], intermediate_out[tmp_out_idx], out[offset], - dout[offset]) - : dintermediate_op.Recompute(x_val, y_val, out[offset], - dout[offset]); - if (SameShapeOfIntermediateOutAndOut) { - d_intermediate[tmp_out_idx] = tmp; - } else { - inter_val += tmp; + if (BcastY) { + val += tmp; + } else { + dy[y_idx] = tmp; + } + } + if (d_intermediate != nullptr) { + T tmp = UseIntermediateOut + ? dintermediate_op.UseIntermediateOut( + y[y_idx], intermediate_out[tmp_out_idx], + out[offset], dout[offset]) + : dintermediate_op.Recompute(x_val, y_val, out[offset], + dout[offset]); + if (SameShapeOfIntermediateOutAndOut) { + d_intermediate[tmp_out_idx] = tmp; + } else { + inter_val += tmp; + } + } } } - i += ELEMWISE_MAX_BLOCK_DIM; - } while (i < h); + // transpose, for ReduceSum with wrap + sdata[threadIdx.y][threadIdx.x] = val; + __syncthreads(); + val = sdata[threadIdx.x][threadIdx.y]; +#pragma unroll + for (int i = BLOCK_X >> 1; i > 0; i >>= 1) { + // reduce sum with wrap + val += platform::CudaShuffleXorSync(0xFFFFFFFF, val, i); + } - h = h > ELEMWISE_MAX_BLOCK_DIM ? 
ELEMWISE_MAX_BLOCK_DIM : h; - if (BcastY) { - if (dy) { - val = paddle::platform::reduceSum(val, tid, h); - if (threadIdx.x == 0) { - dy[j] = val; + size_t idx_j = j + threadIdx.y; + if (BcastY) { + if (dy) { + if (threadIdx.x == 0 && (idx_j < w)) dy[idx_j] = val; } - } - } else { - if (dx) { - val = paddle::platform::reduceSum(val, tid, h); - if (threadIdx.x == 0) { - dx[j] = val; + } else { + if (dx) { + if (threadIdx.x == 0 && (idx_j < w)) dx[idx_j] = val; } } - } - if (!SameShapeOfIntermediateOutAndOut) { - if (d_intermediate) { - inter_val = paddle::platform::reduceSum(inter_val, tid, h); - if (threadIdx.x == 0) { - d_intermediate[j] = inter_val; + + if (!SameShapeOfIntermediateOutAndOut) { + if (d_intermediate) { + sdata[threadIdx.y][threadIdx.x] = inter_val; + __syncthreads(); + inter_val = sdata[threadIdx.x][threadIdx.y]; +#pragma unroll + for (int i = BLOCK_X >> 1; i > 0; i >>= 1) { + // reduce sum with wrap + inter_val += platform::CudaShuffleXorSync(0xFFFFFFFF, inter_val, i); + } + if (threadIdx.x == 0 && (idx_j < w)) d_intermediate[idx_j] = inter_val; } } - } + } // end for } template static void FusedElemwiseAndActGradBroadcast1CUDA( - gpuStream_t stream, const T *x, const T *y, const T *intermediate_out, - const T *out, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, - DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) { - int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); - int gird_size = w; + const framework::ExecutionContext &ctx, const T *x, const T *y, + const T *intermediate_out, const T *out, const T *dout, int h, int w, + DX_OP dx_op, DY_OP dy_op, DIntermediate_OP dintermediate_op, T *dx, T *dy, + T *d_intermediate) { + gpuStream_t stream = ctx.cuda_device_context().stream(); + + dim3 blocks(BLOCK_X, BLOCK_Y); + int max_gpu_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_gpu_threads / (BLOCK_X * BLOCK_Y), 1); + int theory_block = (w + BLOCK_X - 1) / BLOCK_X; + dim3 grids(std::min(theory_block, max_blocks)); + FusedElemwiseAndActGradBroadcast1CUDAKernel< T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut, BcastY, - SameShapeOfIntermediateOutAndOut><<>>( + SameShapeOfIntermediateOutAndOut><<>>( x, y, intermediate_out, out, dout, h, w, dx_op, dy_op, dintermediate_op, dx, dy, d_intermediate); } @@ -2767,7 +2863,7 @@ void FusedElemwiseAndActGradComputeWithBroadcast( FusedElemwiseAndActGradBroadcast1CUDA( - ctx.template device_context().stream(), x_data, y_data, + ctx, x_data, y_data, intermediate_out == nullptr ? nullptr : intermediate_out->data(), out->data(), dout->data(), h, w, dx_op, dy_op, dintermediate_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 321826ec647c99345ac0769c88ac4ffa2be5b0db..101512e35fdcb77ea3d4cccd210494d228a6bb3c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -14,9 +14,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/fast_divmod.h" #ifdef __HIPCC__ #define ELEMENTWISE_BLOCK_SIZE 256 @@ -29,113 +28,153 @@ namespace operators { enum ElementwiseType { kUnary = 1, kBinary = 2 }; -template -struct alignas(sizeof(T) * Size) CudaAlignedVector { - T val[Size]; -}; +/* +* According to NVIDIA, CUDA performs better when the number of threads per +* block is 64/128/256/512, and the number of blocks should be at least +* 2x~4x the number of SMs. Hence, the SM count is taken into account within +* this function to determine the right number of threads per block. +*/ +inline int GetThreadsConfig(const platform::CUDADeviceContext &ctx, + int64_t numel, int vec_size) { + int threads = ELEMENTWISE_BLOCK_SIZE; + int sm_count = ctx.GetSMCount(); + int active_threads_num = numel / vec_size; + if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) { + // Round the thread count up to a power of 2, while the number + // of active blocks is about twice the SM count, to acquire better performance. + threads = platform::RoundToPowerOfTwo(active_threads_num / (sm_count << 1)); + } else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) { + // Round the thread count up to a power of 2, while the number + // of active blocks is about 4 times the SM count, to acquire better performance. + threads = platform::RoundToPowerOfTwo(active_threads_num / (sm_count << 2)); + } + // Number of threads per block shall be larger than 64. + return std::max(64, threads); +} +/* +* Only when the address of the input data is a multiple of 1, 2, or 4 elements +* can a vectorized load of the corresponding width be used. Moreover, a single +* vectorized load moves at most 128 bits. Hence, the valid vectorized-load +* length is determined under both of these constraints. +*/ template int GetVectorizedSizeImpl(const T *pointer) { + constexpr int max_load_bits = 128; + int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); uint64_t address = reinterpret_cast(pointer); + constexpr int vec8 = + std::alignment_of>::value; // NOLINT constexpr int vec4 = std::alignment_of>::value; // NOLINT constexpr int vec2 = std::alignment_of>::value; // NOLINT - if (address % vec4 == 0) { - return 4; + if (address % vec8 == 0) { + /* + * Currently we deal with no more than 4 elements at a time in vectorized + * load/store; if performance tests show that handling 8 elements at a time + * does get optimized, the return statement + * below can be changed into " return std::min(8, valid_vec_size); " .
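 * Hedged worked example (types assumed): for float (4 bytes),
 * valid_vec_size = 128 / CHAR_BIT / 4 = 4, so a 16-byte-aligned pointer
 * yields min(4, 4) = 4 and an 8-byte-aligned one yields 2; for double,
 * valid_vec_size = 2, so at most two elements are loaded per access.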
+ */ + return std::min(4, valid_vec_size); + } else if (address % vec4 == 0) { + return std::min(4, valid_vec_size); } else if (address % vec2 == 0) { - return 2; + return std::min(2, valid_vec_size); + } else { + return 1; } - return 1; } -template +template int GetVectorizedSize(const std::vector &ins, const std::vector &outs) { int vec_size = 4; for (auto iter = ins.begin(); iter != ins.end(); ++iter) { vec_size = - std::min(vec_size, GetVectorizedSizeImpl((*iter)->data())); + std::min(vec_size, GetVectorizedSizeImpl((*iter)->data())); } for (auto iter = outs.begin(); iter != outs.end(); ++iter) { vec_size = - std::min(vec_size, GetVectorizedSizeImpl((*iter)->data())); + std::min(vec_size, GetVectorizedSizeImpl((*iter)->data())); } return vec_size; } -template +template struct ElementwiseDataWrapper { - T *out; - const T *in0; - const T *in1; - __device__ ElementwiseDataWrapper(T *out, const T *in0, - const T *in1 = nullptr) + OutT *out; + const InT *in0; + const InT *in1; + __device__ ElementwiseDataWrapper(OutT *out, const InT *in0, + const InT *in1 = nullptr) : out(out), in0(in0), in1(in1) {} - using VecType = CudaAlignedVector; + using InVecType = CudaAlignedVector; + using OutVecType = CudaAlignedVector; - inline __device__ void load_vector(VecType args[], int idx) { - const VecType *x_vec = reinterpret_cast(in0); + inline __device__ void load_vector(InVecType args[], int idx) { + const InVecType *x_vec = reinterpret_cast(in0); args[0] = x_vec[idx]; if (ET == ElementwiseType::kBinary) { - const VecType *y_vec = reinterpret_cast(in1); + const InVecType *y_vec = reinterpret_cast(in1); args[1] = y_vec[idx]; } } - inline __device__ void load_scalar(T args[], int idx) { + inline __device__ void load_scalar(InT args[], int idx) { args[0] = in0[idx]; if (ET == ElementwiseType::kBinary) { args[1] = in1[idx]; } } - inline __device__ void store_vector(VecType res, int idx) { - VecType *out_vec = reinterpret_cast(out); + inline __device__ void store_vector(OutVecType res, int idx) { + OutVecType *out_vec = reinterpret_cast(out); out_vec[idx] = res; } - inline __device__ void store_scalar(T res, int idx) { out[idx] = res; } + inline __device__ void store_scalar(OutT res, int idx) { out[idx] = res; } }; -template -__device__ void VectorizedKernelImpl( - ElementwiseDataWrapper data, Functor func, int tid) { - using VecType = CudaAlignedVector; - VecType ins_vec[ET]; - VecType out_vec; - T *ins_ptr[ET]; - T *out_ptr; +template +__device__ inline void VectorizedKernelImpl( + ElementwiseDataWrapper data, Functor func, + int tid) { + using InVecType = CudaAlignedVector; + using OutVecType = CudaAlignedVector; + InVecType ins_vec[ET]; + OutVecType out_vec; + InT *ins_ptr[ET]; + InT ins[ET]; #pragma unroll for (int i = 0; i < ET; ++i) { - ins_ptr[i] = reinterpret_cast(&(ins_vec[i])); + ins_ptr[i] = reinterpret_cast(&(ins_vec[i])); } - out_ptr = reinterpret_cast(&out_vec); - // load data.load_vector(ins_vec, tid); // compute #pragma unroll for (int i = 0; i < VecSize; ++i) { - T ins[ET]; #pragma unroll for (int j = 0; j < ET; ++j) { ins[j] = ins_ptr[j][i]; } - out_ptr[i] = func(ins); + out_vec.val[i] = func(ins); } - // store data.store_vector(out_vec, tid); } -template -__device__ void ScalarKernelImpl(ElementwiseDataWrapper data, - Functor func, int start, int remain) { - T ins[ET]; - T out; +template +__device__ inline void ScalarKernelImpl( + ElementwiseDataWrapper data, Functor func, + int start, int remain) { + InT ins[ET]; + OutT out; for (int i = 0; i < remain; ++i) { int idx = start + i; @@ 
-148,14 +187,15 @@ __device__ void ScalarKernelImpl(ElementwiseDataWrapper data, } } -template -__global__ void VectorizedKernel(const T *__restrict__ in0, - const T *__restrict__ in1, T *out, int size, - Functor func) { +template +__global__ void VectorizedKernel(const InT *__restrict__ in0, + const InT *__restrict__ in1, OutT *out, + int size, Functor func) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int remain = size - VecSize * tid; remain = remain > 0 ? remain : 0; - auto data = ElementwiseDataWrapper(out, in0, in1); + auto data = ElementwiseDataWrapper(out, in0, in1); if (remain >= VecSize) { VectorizedKernelImpl(data, func, tid); } else { @@ -163,32 +203,34 @@ __global__ void VectorizedKernel(const T *__restrict__ in0, } } -template -__global__ void ScalarKernel(const T *__restrict__ in0, - const T *__restrict__ in1, T *out, int size, +template +__global__ void ScalarKernel(const InT *__restrict__ in0, + const InT *__restrict__ in1, OutT *out, int size, Functor func) { - auto data = ElementwiseDataWrapper(out, in0, in1); + auto data = ElementwiseDataWrapper(out, in0, in1); int tid = blockIdx.x * blockDim.x + threadIdx.x; int remain = tid < size ? 1 : 0; ScalarKernelImpl(data, func, tid, remain); } -template -void LaunchElementwiseCudaKernel( +template +void LaunchSameDimsElementwiseCudaKernel( const platform::CUDADeviceContext &ctx, const std::vector &ins, std::vector *outs, Functor func) { // calculate the max vec_size for all ins and outs auto size = ins[0]->numel(); - int vec_size = GetVectorizedSize(ins, *outs); - int block_size = ELEMENTWISE_BLOCK_SIZE; + int vec_size = GetVectorizedSize(ins, *outs); + int block_size = GetThreadsConfig(ctx, size, vec_size); int grid_size = ((size + vec_size - 1) / vec_size + block_size - 1) / block_size; - const T *in0 = ins[0]->data(); - const T *in1 = (ET == ElementwiseType::kBinary) ? ins[1]->data() : nullptr; - T *out = (*outs)[0]->data(); + const InT *in0 = ins[0]->data(); + const InT *in1 = + (ET == ElementwiseType::kBinary) ? ins[1]->data() : nullptr; + OutT *out = (*outs)[0]->data(); // cuda kernel auto stream = ctx.stream(); + switch (vec_size) { case 4: VectorizedKernel<<>>( diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu index 320d1e7b38da8e4f77015ef2b7bcc73e5db7675f..5335f274ef126f228694d1bfb23cb15f6da158ee 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu @@ -8,10 +8,52 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_pow_op.h" namespace ops = paddle::operators; +namespace paddle { +namespace operators { + +template +struct CudaPowFunctor { + inline HOSTDEVICE T operator()(const T args[]) const { + return std::pow(args[0], args[1]); + } +}; + +template +struct CudaPowFunctor< + T, typename std::enable_if::value>::type> { + // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and + // it will return a float number like 2.99... , which floor to 2 + // when cast to int by default and it is wrong. + // Use llrint to cast it to the nearest integer, which is 3. 
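A small host-side illustration of the rounding hazard that the comment above describes (not Paddle code; the inexact value is hand-picked to mimic what a float-precision pow can return):

#include <cmath>
#include <cstdio>

// If pow returns a value just below the exact integer, a plain cast truncates
// to the wrong answer, while llrint rounds to the nearest integer and is safe
// in both cases.
int main() {
  double exact = std::pow(3.0, 1.0);      // usually exactly 3 on the host
  double inexact = 2.9999997615814209;    // the kind of value float pow may produce
  std::printf("cast:   %lld vs %lld\n",
              static_cast<long long>(exact), static_cast<long long>(inexact));  // 3 vs 2
  std::printf("llrint: %lld vs %lld\n",
              std::llrint(exact), std::llrint(inexact));                        // 3 vs 3
  return 0;
}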
+ inline HOSTDEVICE T operator()(const T args[]) const { + return std::llrint(std::pow(args[0], args[1])); + } +}; + +template +class ElementwisePowKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaPowFunctor()); + } +}; + +} // namespace operators +} // namespace paddle + REGISTER_OP_CUDA_KERNEL( elementwise_pow, ops::ElementwisePowKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc index 26cc925b869c647d5a02215c8c8621782cdf2303..e0763d769f047a963ea8e4905a9e79e1b583703a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op_npu.cc @@ -40,7 +40,7 @@ class ElementwisePowNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Pow", {*x, *y}, {*out}, {}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 1951ed7f5da67316a11d0bbc96b902dbf9a4c440..84aa189b89e909f66c994bd765a3d192e393a1ea 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -20,8 +20,8 @@ limitations under the License. */ namespace paddle { namespace platform { -struct complex128; -struct complex64; +template +struct complex; } // namespace platform } // namespace paddle @@ -134,9 +134,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, + paddle::platform::complex>, ops::ElementwiseSubKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, @@ -144,9 +144,9 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + paddle::platform::complex>, ops::ElementwiseSubGradKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad_grad, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseSubDoubleGradKernel); + paddle::platform::complex>); REGISTER_OP_VERSION(elementwise_sub) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 192999fd2ac831e85d42a41e5a54754a49f4ddce..da9610243f7c4df3300b3ea8b9137cea84e5c72b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -11,11 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -25,37 +23,25 @@ namespace paddle { namespace operators { template -struct SameDimsElemwiseSub { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - SubRangeFunctor functor(x->data(), y->data(), z->data()); - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, - x->numel()); - for_range(functor); +struct CudaSubFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return args[0] - args[1]; } }; -template <> -struct SameDimsElemwiseSub { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z) { - auto size = x->numel(); - dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) / - PADDLE_CUDA_THREAD_SIZE, - 1); - dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); - const half* x2 = - reinterpret_cast(x->data()); - const half* y2 = - reinterpret_cast(y->data()); - half* z2 = reinterpret_cast(z->data()); - SameDimsElemwiseSubCUDAKernel<<< - grid_size, block_size, 0, - ctx.template device_context().stream()>>>( - x2, y2, z2, size); +template +class ElementwiseSubKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + std::vector ins; + std::vector outs; + const auto& cuda_ctx = + ctx.template device_context(); + + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + LaunchElementwiseCudaKernel( + cuda_ctx, ins, &outs, axis, CudaSubFunctor()); } }; @@ -103,9 +89,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, + paddle::platform::complex>, ops::ElementwiseSubKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, ops::ElementwiseSubGradKernel, @@ -115,9 +101,9 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + paddle::platform::complex>, ops::ElementwiseSubGradKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad_grad, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel, + paddle::platform::complex>, ops::ElementwiseSubDoubleGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 4171d2eb9e5e53ea2fff9a2ab7521f2e5c4ae438..426093413276092538c67676abb2c1e9b7f637ed 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -11,8 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ - #pragma once + #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index a6e438f8016e0cd4c8fccee6c664d509b8c170eb..94e78defbbee5d767194dd403a176574008f03ac 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -33,7 +33,7 @@ class ElementwiseSubNPUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {}); + const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() @@ -84,8 +84,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { } reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); reduced_dout.mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); runner.Run(stream); tmp_dout = &reduced_dout; } @@ -98,8 +99,8 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { } } if (axes.size() != 0) { - auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, - {{"axes", axes}, {"keep_dims", true}}); + const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, + {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); } else { framework::TensorCopy( @@ -127,8 +128,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { } reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); reduced_dout.mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); runner.Run(stream); tmp_dout = &reduced_dout; } @@ -144,14 +146,15 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { if (axes.size() != 0) { reduced_dy.Resize(dy->dims()); reduced_dy.mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy}, - {{"axes", axes}, {"keep_dims", true}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy}, + {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); tmp_dy = &reduced_dy; } // stage 3, negative - auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {}); + const auto& runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {}); runner.Run(stream); } } diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index e5d20893335f702c0188ff7a8deaa2b41b848b85..ddad70a6a5f31ccb974f78ca35f045c59f45b8be 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -47,23 +47,13 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { float scale_o = ctx.Attr("Scale_out"); int axis = ctx.Attr("axis"); - bool is_inplaced = x->IsSharedBufferWith(*z); - - std::string key = is_inplaced - ? 
platform::CreateKey(dev_ctx, ctx.OutputName("Out"), - x->format(), y->format()) - : ctx.OutputName("Out"); - platform::BinaryMKLDNNHandler handler( BINARY_OP, axis, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, y, z, - scale_x, scale_y, scale_o, key); + scale_x, scale_y, scale_o, ctx.OutputName("Out")); const auto src_x_memory = handler.AcquireSrcMemory(x); const auto src_y_memory = handler.AcquireSecondSrcMemory(y); - - // For Inplace src and and dst are the same memory object - const auto dst_memory = - is_inplaced ? src_x_memory : handler.AcquireDstMemory(z); + const auto dst_memory = handler.AcquireDstMemory(z); const auto binary_prim = handler.AcquireForwardPrimitive(); diff --git a/paddle/fluid/operators/erf_op.cc b/paddle/fluid/operators/erf_op.cc index 09cdf4d8b2a0dd3b445dc5215dd86b8b1963196e..f68f670394871114369f8b05b7f958c03d5508d0 100644 --- a/paddle/fluid/operators/erf_op.cc +++ b/paddle/fluid/operators/erf_op.cc @@ -130,3 +130,14 @@ REGISTER_OP_CPU_KERNEL( ops::ErfGradKernel, ops::ErfGradKernel); + +REGISTER_OP_CUDA_KERNEL( + erf, ops::ErfKernel, + ops::ErfKernel, + ops::ErfKernel); +REGISTER_OP_CUDA_KERNEL( + erf_grad, ops::ErfGradKernel, + ops::ErfGradKernel, + ops::ErfGradKernel); diff --git a/paddle/fluid/operators/erf_op.cu b/paddle/fluid/operators/erf_op.cu deleted file mode 100644 index 357b9e79c4e72854549f11ab49735fac65a400be..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/erf_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/erf_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - erf, ops::ErfKernel, - ops::ErfKernel, - ops::ErfKernel); -REGISTER_OP_CUDA_KERNEL( - erf_grad, ops::ErfGradKernel, - ops::ErfGradKernel, - ops::ErfGradKernel); diff --git a/paddle/fluid/operators/erf_op.h b/paddle/fluid/operators/erf_op.h index 08c827df95d9bfa4f01f3c7af9e657b7b3a360a8..4780b2e7f5b28d4a743f6d35046891b30cbefd00 100644 --- a/paddle/fluid/operators/erf_op.h +++ b/paddle/fluid/operators/erf_op.h @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -35,7 +36,8 @@ class ErfKernel : public framework::OpKernel { auto eigen_in = framework::EigenVector::Flatten(*in); auto& place = *context.template device_context().eigen_device(); - eigen_out.device(place) = eigen_in.erf(); + EigenErf, T>::Eval(place, eigen_out, + eigen_in); } }; @@ -55,8 +57,8 @@ class ErfGradKernel : public framework::OpKernel { auto eigen_dx = framework::EigenVector::Flatten(*dx); auto& place = *context.template device_context().eigen_device(); - eigen_dx.device(place) = - eigen_dout * static_cast(M_2_SQRTPI) * (-(eigen_x.square())).exp(); + EigenErfGrad, T>::Eval(place, eigen_dx, + eigen_x, eigen_dout); } }; diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index bb3a6512d2c8ba3b5f0d643a5ae6d906a00717c3..76d5a203f306b9b9773af50d5de5db7b6c89ae5e 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -11,7 +11,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_ASCEND_CL #include #include #include @@ -65,7 +64,7 @@ class ExpandNPUKernel : public framework::OpKernel { out0->Resize(out_dims); out0->mutable_data(context.device_context().GetPlace()); - auto runner = + const auto& runner = NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", expand_times}}); auto stream = context.template device_context() @@ -82,5 +81,3 @@ REGISTER_OP_NPU_KERNEL( ops::ExpandNPUKernel, ops::ExpandNPUKernel); - -#endif diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 78052179f6be72c39d7d78aab5237ab6beb8c645..583ff157a0d398d801473b6a22c34771261f1f33 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -25,18 +25,19 @@ __global__ void FindAbsMaxKernel(const T* in, const int n, T* out) { int bid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x; - extern __shared__ T shared_max_data[]; + extern __shared__ char* shared_max_data_tmp[]; + auto shared_max_data = reinterpret_cast(shared_max_data_tmp); if (gridDim.x > 1) { shared_max_data[tid] = T(0); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { - T tmp = fabs(in[i]); + T tmp = abs(in[i]); if (tmp > shared_max_data[tid]) { shared_max_data[tid] = tmp; } } } else { if (bid < n) { - shared_max_data[tid] = fabs(in[bid]); + shared_max_data[tid] = abs(in[bid]); } else { shared_max_data[tid] = T(0); } @@ -73,6 +74,8 @@ struct FindAbsMaxFunctor { }; template struct FindAbsMaxFunctor; +template struct FindAbsMaxFunctor; template __global__ void FindChannelAbsMaxKernelQuantAxis0(const T* in, const int n, @@ -213,13 +216,16 @@ __global__ void ClipAndQuantDequantKernel(const T* in, const T* scale, int tid = threadIdx.x; T s = scale[0]; - T inv_s = inverse(s); + T bin_cnt_t = static_cast(bin_cnt); + for (int i = bid; i < n; i += blockDim.x * gridDim.x) { T x = in[i]; - T v = x > s ? s : x; - v = v < -s ? -s : v; - v = bin_cnt * inv_s * v; - out[i] = round(v) * s / bin_cnt; + x = x > s ? s : x; + x = x < -s ? 
-s : x; + x = (bin_cnt_t / s) * x; + + x = static_cast(round(static_cast(x))); + out[i] = (x * s) / bin_cnt_t; } } @@ -261,9 +267,6 @@ struct ClipAndFakeQuantDequantFunctor { } }; -template struct ClipAndFakeQuantDequantFunctor; - // ChannelClipAndQuantKernel for quant_axis is 0 template __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale, @@ -423,8 +426,10 @@ struct FindMovingAverageAbsMaxFunctor { memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), ctx.stream()); ctx.Wait(); - state = rate * state + 1; - accum = rate * accum + scale; + + T rate_t = static_cast(rate); + state = rate_t * state + static_cast(1.0); + accum = rate_t * accum + scale; scale = accum / state; memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), @@ -527,10 +532,12 @@ template struct ChannelClipFakeQuantDequantFunctor); REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_abs_max, - ops::FakeQuantizeDequantizeAbsMaxKernel); + ops::FakeQuantizeDequantizeAbsMaxKernel, + ops::FakeQuantizeDequantizeAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max, ops::FakeChannelWiseQuantizeAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max, @@ -539,12 +546,15 @@ REGISTER_OP_CUDA_KERNEL( fake_quantize_moving_average_abs_max, ops::FakeQuantizeMovingAverageAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(moving_average_abs_max_scale, - ops::MovingAverageAbsMaxScaleKernel); + ops::MovingAverageAbsMaxScaleKernel, + ops::MovingAverageAbsMaxScaleKernel); REGISTER_OP_CUDA_KERNEL( fake_quantize_dequantize_moving_average_abs_max, - ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel); + ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel, + ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(stright_throuth_estimator_grad, - ops::StrightThroughEstimatorGradKernel); + ops::StrightThroughEstimatorGradKernel, + ops::StrightThroughEstimatorGradKernel); REGISTER_OP_CUDA_KERNEL( fake_channel_wise_quantize_dequantize_abs_max, ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel); diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index f35d8b6bbf89f188864e37fb267101333163cd41..d465e77ea1886f7f35549a043951048fb2bcb61d 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -147,16 +147,15 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel); +REGISTER_OP_CPU_KERNEL( + fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel>, + ops::FillConstantKernel>); REGISTER_OP_VERSION(fill_constant) .AddCheckpoint( diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc index e784c20b8b8b4f9fa61b3bcebf481a989d4bb033..a862cda13888ee7086d8ce17511b9851a36d18a6 100644 --- a/paddle/fluid/operators/fill_constant_op.cu.cc +++ b/paddle/fluid/operators/fill_constant_op.cu.cc @@ -15,12 +15,11 @@ limitations under the License. 
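The rewritten ClipAndQuantDequantKernel performs clip, quantize, round, and dequantize in sequence. A scalar host sketch of that round trip, with the int8-style bin count chosen only for illustration:

#include <algorithm>
#include <cmath>
#include <cstdio>

// Host sketch of the fake quantize-dequantize step: clamp to [-scale, scale],
// map onto bin_cnt integer levels, round, then map back to the float range.
// The result carries the quantization error the model will see at inference.
float FakeQuantDequant(float x, float scale, int bin_cnt) {
  float v = std::min(std::max(x, -scale), scale);  // clip
  float q = std::round((bin_cnt / scale) * v);     // quantize to the integer grid
  return q * scale / bin_cnt;                      // dequantize
}

int main() {
  float scale = 2.0f;
  int bin_cnt = 127;  // e.g. symmetric int8-style quantization
  for (float x : {-3.0f, -0.013f, 0.0f, 1.23456f, 5.0f}) {
    std::printf("%g -> %g\n", x, FakeQuantDequant(x, scale, bin_cnt));
  }
  return 0;
}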
*/ #include "paddle/fluid/operators/fill_constant_op.h" namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel); +REGISTER_OP_CUDA_KERNEL( + fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel, + ops::FillConstantKernel>, + ops::FillConstantKernel>); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 46c4ae12036a4a808061a55677e6c433d40035ad..17c7321122b174226010810b9223770ed2b84a7e 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -117,6 +117,9 @@ class FillConstantKernel : public framework::OpKernel { } if (actual_place == 0) { + VLOG(4) << "[CPU] FillConstantKernel" + << ((data_type == framework::proto::VarType::BF16) ? "" + : ""); tensor->mutable_data(platform::CPUPlace(), data_type); math::SetConstant functor; functor(reinterpret_cast(dev_ctx), diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index 4ea4c11c478357aa7ca98fc0de4467bae7100a87..2626e6d960f8e952a722eb6a31b995c829610c5e 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -68,8 +68,8 @@ class FillConstantNPUKernel : public framework::OpKernel { FillNpuTensorWithConstant(&tensor_tmp, value); out_var->mutable_data(shape, place); - auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var}, - {{"dims", framework::vectorize(shape)}}); + const auto& runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var}, + {{"dims", framework::vectorize(shape)}}); runner.Run(stream); } }; diff --git a/paddle/fluid/operators/fill_constant_op_xpu.cc b/paddle/fluid/operators/fill_constant_op_xpu.cc index 16dd4c9292f89a05d58cfc1d821c5a43f45f5add..d55b8e2b81b52f173dc2f8f158a2f42ae7abd7eb 100644 --- a/paddle/fluid/operators/fill_constant_op_xpu.cc +++ b/paddle/fluid/operators/fill_constant_op_xpu.cc @@ -15,11 +15,10 @@ limitations under the License. 
*/ namespace ops = paddle::operators; #ifdef PADDLE_WITH_XPU -REGISTER_OP_XPU_KERNEL(fill_constant, ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel, - ops::FillConstantKernel); +REGISTER_OP_XPU_KERNEL( + fill_constant, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel>, + ops::FillConstantKernel>); #endif diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 1b2f1db1b07cdd883417fb5f98e4c685fe32c515..efcb0cbe2e2a8d8bbf964cc4f2d2496e6a6fa991 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -120,23 +120,9 @@ template class FlattenContiguousRangeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto &start_axis = context.Attr("start_axis"); - auto &stop_axis = context.Attr("stop_axis"); - auto *in = context.Input("X"); - auto x_dims = in->dims(); - int in_dims_size = x_dims.size(); - int real_start_axis = start_axis, real_stop_axis = stop_axis; - if (start_axis < 0) { - real_start_axis = start_axis + in_dims_size; - } - if (stop_axis < 0) { - real_stop_axis = stop_axis + in_dims_size; - } auto *out = context.Output("Out"); - - auto out_dims = framework::make_ddim( - GetOutputShape(real_start_axis, real_stop_axis, x_dims)); + auto out_dims = out->dims(); out->mutable_data(context.GetPlace(), in->type()); framework::TensorCopy( @@ -144,27 +130,6 @@ class FlattenContiguousRangeKernel : public framework::OpKernel { context.template device_context(), out); out->Resize(out_dims); } - static std::vector GetOutputShape(const int start_axis, - const int stop_axis, - const framework::DDim &in_dims) { - int64_t outer = 1; - std::vector out_shape; - int in_dims_size = in_dims.size(); - out_shape.reserve(in_dims_size - stop_axis + start_axis); - - for (int i = 0; i < start_axis; ++i) { - out_shape.push_back(in_dims[i]); - } - for (int i = start_axis; i <= stop_axis; i++) { - outer *= in_dims[i]; - } - out_shape.push_back(outer); - for (int i = stop_axis + 1; i < in_dims_size; i++) { - out_shape.push_back(in_dims[i]); - } - - return out_shape; - } }; template diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 287827ced5115e1043f033fc966b0944f46494b1..104298e037319c6fbbfc8da830543fe06eb4dcac 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -32,8 +32,7 @@ if (WITH_GPU OR WITH_ROCM) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_batch_norm_act);\n") endif() # conv_fusion_op needs cudnn 7 above - # HIP not support cudnnConvolutionBiasActivationForward - if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100)) + if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) op_library(conv_fusion_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") endif() diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index c9ba7a61e0907f53888b7088a1fa203d10c569e0..f5ee7f559918457c600324bf2d24daa247c938da 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -18,14 +18,18 @@ limitations under the License. 
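The removed GetOutputShape helper (the kernel now reuses the shape already inferred on Out) collapsed the dims in [start_axis, stop_axis] into a single dimension. A standalone sketch of that rule, including the negative-axis normalization the old code performed:

#include <cstdint>
#include <cstdio>
#include <vector>

// flatten_contiguous_range output shape: keep dims before start_axis, multiply
// dims in [start_axis, stop_axis] into one dim, keep dims after stop_axis.
// Negative axes count from the end.
std::vector<int64_t> FlattenShape(const std::vector<int64_t> &in_dims,
                                  int start_axis, int stop_axis) {
  int rank = static_cast<int>(in_dims.size());
  if (start_axis < 0) start_axis += rank;
  if (stop_axis < 0) stop_axis += rank;
  std::vector<int64_t> out;
  for (int i = 0; i < start_axis; ++i) out.push_back(in_dims[i]);
  int64_t outer = 1;
  for (int i = start_axis; i <= stop_axis; ++i) outer *= in_dims[i];
  out.push_back(outer);
  for (int i = stop_axis + 1; i < rank; ++i) out.push_back(in_dims[i]);
  return out;
}

int main() {
  // [2, 3, 4, 5] flattened over axes 1..-2 -> [2, 12, 5]
  for (int64_t d : FlattenShape({2, 3, 4, 5}, 1, -2)) {
    std::printf("%lld ", static_cast<long long>(d));
  }
  std::printf("\n");
  return 0;
}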
*/ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/math/padding.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/miopen_helper.h" +#else #include "paddle/fluid/platform/cudnn_helper.h" +#endif DECLARE_int64(cudnn_exhaustive_search_times); namespace paddle { namespace operators { -#if CUDNN_VERSION >= 7100 +#if PADDLE_WITH_HIP || CUDNN_VERSION >= 7100 using Tensor = framework::Tensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; @@ -162,7 +166,78 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { if (input->dims().size() == 5) { layout = DataLayout::kNCDHW; } +#ifdef PADDLE_WITH_HIP + miopenConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(padding_common, strides, dilations); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenSetConvolutionGroupCount(cudnn_conv_desc, + groups)); + // Now only support NCHW + std::vector bias_dim = { + 1, static_cast(transformed_output.dims()[1]), 1, 1}; + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize(transformed_input.dims())); + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize(transformed_output.dims())); + miopenTensorDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize(filter->dims())); + miopenTensorDescriptor_t cudnn_bias_desc = + bias_desc.descriptor(layout, bias_dim); + miopenActivationDescriptor_t cudnn_act_desc = + act_desc.descriptor(activation); + miopenConvFwdAlgorithm_t algo; + auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + auto x_dims = framework::vectorize(transformed_input.dims()); + auto f_dims = framework::vectorize(filter->dims()); + + size_t workspace_size = 0; + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( + handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_output_desc, &workspace_size)); + int find_count; + miopenConvAlgoPerf_t find_result; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenFindConvolutionForwardAlgorithm( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, + kNUM_CUDNN_FWD_ALGS, &find_count, &find_result, + cudnn_workspace_ptr, workspace_size, false)); + }; + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.fwd_algo; + VLOG(3) << "cuDNN forward algo " << algo; + + { + ScalingParamType alpha = 1.0f, beta = 0.0f; + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, &beta, cudnn_output_desc, + output_data, cudnn_workspace, workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionForwardBias( + handle, &alpha, cudnn_bias_desc, bias_data, &beta, + cudnn_output_desc, output_data)); + if (activation != "identity") { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationForward( + handle, cudnn_act_desc, &alpha, cudnn_output_desc, output_data, + &beta, cudnn_output_desc, output_data)); + } + if (residual) { + 
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, cudnn_output_desc, output_data, + &alpha, cudnn_output_desc, residual_data, &beta, cudnn_output_desc, + output_data)); + } + } +#else // PADDLE_WITH_HIP cudnnConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(padding_common, strides, dilations); PADDLE_ENFORCE_CUDA_SUCCESS( @@ -327,6 +402,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { }; workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } +#endif std::vector channels = ctx.Attr>("split_channels"); if (channels.size()) { auto outs = ctx.MultiOutput("Outputs"); @@ -358,8 +434,11 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -#if CUDNN_VERSION >= 7100 namespace ops = paddle::operators; +#if CUDNN_VERSION >= 7100 REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel, ops::CUDNNConvFusionOpKernel); #endif +#ifdef PADDLE_WITH_HIP +REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel); +#endif diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_activation_op.cc index 97cd4d90be689ac7e891af9fe098b56bea000166..e9ad2895e03db8e77470c490453427a41d8e3bba 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cc @@ -173,7 +173,9 @@ void FusedBatchNormActOpMaker::Make() { .AddCustomChecker([](const float &epsilon) { PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, platform::errors::InvalidArgument( - "'epsilon' should be between 0.0 and 0.001.")); + "Attr(epsilon) should be between 0.0 and 0.001, " + "but received value is %f.", + epsilon)); }); AddAttr("act_type", "The activation type to be fused.") .SetDefault("relu"); diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc index 4ff66d0d2b856d505fade0510c22b565e0d94678..d51e0de38009bfdf5ba866240ead5c38d0d3c1cf 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.cc @@ -69,7 +69,7 @@ static bool IsSupportedCompound(const std::vector &functors) { functors.size(), 2)); static std::unordered_set unary_fun = {"scale", "relu", "tanh", - "sigmoid"}; + "sigmoid", "gelu"}; static std::unordered_set binary_fun = {"elementwise_add", "elementwise_mul"}; diff --git a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h index c61b9a9e48854aef094b1da239ae581e38d2e278..b7dd89a8a28adffc09b75a1845a79fb66c0b67c8 100644 --- a/paddle/fluid/operators/fused/fused_elemwise_activation_op.h +++ b/paddle/fluid/operators/fused/fused_elemwise_activation_op.h @@ -275,6 +275,13 @@ static void RunFunctors(const framework::ExecutionContext &ctx, paddle::operators::math::SigmoidFunctor>( ctx, paddle::operators::math::MulFunctor(), paddle::operators::math::SigmoidFunctor(), in_x, in_y, outputs); + } else if (funcs_str == "gelu,elementwise_add") { + // Z = Unary(Binary(X, Y)) + RunUnaryCompoundFunctors, + paddle::operators::math::AddFunctor>( + ctx, paddle::operators::math::GeluFunctor(), + paddle::operators::math::AddFunctor(), in_x, in_y, outputs); } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s has not been implemented.", funcs_str)); @@ -374,6 +381,16 @@ static void RunGradFunctors( 
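The new "gelu,elementwise_add" case fuses Z = Unary(Binary(X, Y)). A scalar sketch of that composition; the erf-based GELU used here is an assumption, since GeluFunctor's exact form is not shown in this diff:

#include <cmath>
#include <cstdio>

// Exact (erf-based) GELU; the fused functor may instead use the tanh
// approximation -- treat this form as an assumption for illustration.
double Gelu(double x) { return 0.5 * x * (1.0 + std::erf(x / std::sqrt(2.0))); }

// The fused pattern: Z = Unary(Binary(X, Y)) = gelu(x + y), computed in one
// pass instead of materializing the intermediate x + y tensor.
double GeluAdd(double x, double y) { return Gelu(x + y); }

int main() {
  std::printf("%f\n", GeluAdd(0.3, -0.1));  // gelu(0.2) ~= 0.1159
  return 0;
}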
paddle::operators::math::SigmoidFunctor(), paddle::operators::math::SigmoidGradFunctor(), in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); + } else if (funcs_str == "gelu_grad,elementwise_add_grad") { + // The backward of Z = Unary(Binary(X, Y)) + RunUnaryCompoundGradFunctors< + DeviceContext, T, paddle::operators::math::GeluGradFunctor, + paddle::operators::math::AddFunctor, + paddle::operators::math::AddGradFunctor, InPlace>( + ctx, paddle::operators::math::GeluGradFunctor(), + paddle::operators::math::AddFunctor(), + paddle::operators::math::AddGradFunctor(), in_x, in_y, in_out, + in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out); } else { PADDLE_THROW(platform::errors::InvalidArgument( "%s has not been implemented.", funcs_str)); diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc index b53b407d4995da5d548a13fec20ff3b09a5583c4..4d270280d389c6d8c34e3a5691a41a684b537577 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cc @@ -25,11 +25,13 @@ class EmbeddingEltWiseLayerNormOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* context) const override { - PADDLE_ENFORCE_EQ(context->Inputs("Ids").size(), - context->Inputs("Embs").size(), - platform::errors::InvalidArgument( - "Two inputs of EmbeddingEltWiseLayerNormOp shoube be " - "the same size")); + PADDLE_ENFORCE_EQ( + context->Inputs("Ids").size(), context->Inputs("Embs").size(), + platform::errors::InvalidArgument( + "Two inputs of EmbeddingEltWiseLayerNormOp shoube be " + "the same size, but received the size of input Ids = %d," + " the size of input Embs = %d", + context->Inputs("Ids").size(), context->Inputs("Embs").size())); PADDLE_ENFORCE_GE(context->Inputs("Embs").size(), 2UL, platform::errors::InvalidArgument( "Input Embs of EmbeddingEltWiseLayerNormOp should " @@ -77,7 +79,8 @@ class EmbeddingEltWiseLayerNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( embs_dims[i][1], hidden, platform::errors::InvalidArgument( - "The Emb first dim size(%d) shoule equal to hidden (%d).", + "The second dimension size(%d) of the Embedding should be " + "equal to the hidden's size(%d)", embs_dims[i][1], hidden)); } diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 6cca6b5a9729a7065e64771ec6bfb2b1cbb52cf5..42bf784b2af4fbcb1cde36d995f1152f0e31635b 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -249,6 +249,11 @@ void FusionLSTMOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "int8", "bfloat16"}); AddAttr("Scale_data", "Scale to be used for int8 input/output data." 
"Only used with MKL-DNN INT8.") diff --git a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc index bd376b1e7aaefbf890e174cc86899b990a9fed26..382d01f6a535c76bdd38102a0cb40e5afc345f07 100644 --- a/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc @@ -40,7 +40,9 @@ class TransposeFlattenConcatFusionOp : public framework::OperatorWithKernel { const size_t n = ins.size(); PADDLE_ENFORCE_GT(n, 0, platform::errors::InvalidArgument( - "Input tensors dim size should greater than 0.")); + "The size of Inputs(X)'s dimension should be greater " + " than 0, but received %d.", + n)); std::vector trans_axis = ctx->Attrs().Get>("trans_axis"); diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c737ba361e0f2573d46def53d1b566774a4bd90f --- /dev/null +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.h" +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +namespace paddle { +namespace operators { + +using framework::Tensor; + +class SoftmaxMaskFuseUpperTriangleOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", + "SoftmaxMaskFuseUpperTriangle"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", + "SoftmaxMaskFuseUpperTriangle"); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ( + x_dims.size(), 4, + platform::errors::InvalidArgument("Input x must be in 4D dimension but " + "received the dimension of X is %d", + x_dims.size())); + + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +class SoftmaxMaskFuseUpperTriangleOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input of softmax_mask_fuse_upper_triangle op, " + "which is the result of matmul(QK)/sqrt(dk)."); + AddOutput("Out", "The result of softmax_mask_fuse_upper_triangle op."); + + AddComment(R"DOC( +Softmax Mask Fuse Operator. +product = matmul(QK)/sqrt(dk) +output = softmax_mask_fuse_upper_triangle(product) +to get the final output. 
+)DOC"); + } +}; + +class SoftmaxMaskFuseUpperTriangleOpGrad + : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), + "SoftmaxMaskFuseUpperTriangleGrad"); + + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), out_dims); + ctx->ShareLoD(framework::GradVarName("Out"), framework::GradVarName("X")); + } +}; + +template +class SoftmaxMaskFuseUpperTriangleGradOpMaker + : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fused_softmax_mask_upper_triangle_grad"); + op->SetInput("Softmax", this->Output("Out")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_softmax_mask_upper_triangle, ops::SoftmaxMaskFuseUpperTriangleOp, + ops::SoftmaxMaskFuseUpperTriangleOpMaker, + ops::SoftmaxMaskFuseUpperTriangleGradOpMaker, + ops::SoftmaxMaskFuseUpperTriangleGradOpMaker); +REGISTER_OPERATOR(fused_softmax_mask_upper_triangle_grad, + ops::SoftmaxMaskFuseUpperTriangleOpGrad); +REGISTER_OP_CPU_KERNEL(fused_softmax_mask_upper_triangle, + ops::SoftmaxMaskFuseUpperTriangleCPUKernel< + paddle::platform::CPUDeviceContext, float>, + ops::SoftmaxMaskFuseUpperTriangleCPUKernel< + paddle::platform::CPUDeviceContext, double>); diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..3bebbee1fb7ccb0e465d84a542f214cb59ed54c6 --- /dev/null +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -0,0 +1,546 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +// this file is inspired by: +// https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h + +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif +#ifdef PADDLE_WITH_HIP +#include +#include +#endif +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { +using framework::Tensor; + +#ifdef PADDLE_WITH_HIP +#define WARP_SIZE 64 +#else +#define WARP_SIZE 32 +#endif + +#define MASK 0xffffffff + +namespace plat = paddle::platform; + +__device__ __inline__ void load_data_upper_tri(plat::float16* dst, + const plat::float16* src) { + *(reinterpret_cast(dst)) = *(reinterpret_cast(src)); +} + +__device__ __inline__ void load_data_upper_tri(float* dst, const float* src) { + *(reinterpret_cast(dst)) = *(reinterpret_cast(src)); +} + +__device__ __inline__ void load_zero_vector_upper_tri(plat::float16* dst) { + *(reinterpret_cast(dst)) = make_float2(0.0f, 0.0f); +} + +__device__ __inline__ void load_zero_vector_upper_tri(float* dst) { + *(reinterpret_cast(dst)) = make_float4(0.0f, 0.0f, 0.0f, 0.0f); +} + +int get_pow2_index_value(int value) { + int pow2_index = 0; + while ((1 << pow2_index) < value) { + ++pow2_index; + } + return pow2_index; +} + +template +struct AddOP_upper_tri { + __device__ __forceinline__ T operator()(T a, T b) const { return a + b; } +}; + +template +struct MaxOP_upper_tri { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? b : a; + } +}; + +template +__device__ __forceinline__ T warp_shfl_xor_upper_tri(T value, int laneMask, + int width, + unsigned int mask = MASK) { +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce_upper_tri(T* sum) { + ReduceOp r; +#pragma unroll + for (int offset = width / 2; offset > 0; offset /= 2) { +#pragma unroll + for (int i = 0; i < batch; ++i) { + T b = warp_shfl_xor_upper_tri(sum[i], offset, width); + sum[i] = r(sum[i], b); + } + } +} + +template +__global__ void SoftmaxMaskFuseUpperTriangleGPUKernel(const T* src, T* dst, + int batch_count, + int key_seq_len) { + constexpr int next_pow2 = 1 << pow2_index; + constexpr int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; + constexpr int kLocalIterations = std::max(next_pow2 / warp_size, 4); + constexpr int kLocalBatchSize = (next_pow2 <= 128) ? 
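warp_reduce_upper_tri combines per-thread partial results with XOR shuffles. The sketch below simulates that butterfly pattern on the host over an array of "lanes"; real warp shuffles exchange registers between threads of a warp, so this is illustrative only:

#include <cstdio>
#include <vector>

// Host simulation of the XOR-butterfly reduction: after log2(width) rounds,
// every "lane" holds the reduction of all lanes. lanes.size() must be a power
// of two and op must be associative and commutative (sum, max).
template <typename T, typename Op>
void ButterflyReduce(std::vector<T> *lanes, Op op) {
  int width = static_cast<int>(lanes->size());
  for (int offset = width / 2; offset > 0; offset /= 2) {
    std::vector<T> shuffled(*lanes);
    for (int lane = 0; lane < width; ++lane) {
      // __shfl_xor_sync(mask, value, offset) reads the value held by lane ^ offset.
      shuffled[lane] = op((*lanes)[lane], (*lanes)[lane ^ offset]);
    }
    *lanes = shuffled;
  }
}

int main() {
  std::vector<int> lanes = {3, 1, 4, 1, 5, 9, 2, 6};
  ButterflyReduce(&lanes, [](int a, int b) { return a + b; });
  std::printf("%d\n", lanes[0]);  // 31, and every lane holds the same sum
  return 0;
}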
2 : 1; + constexpr int kOneLoadingCounts = 4; + int key_seq_len_pow_2 = key_seq_len * key_seq_len; + + int first_idx = + (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * kLocalBatchSize + + blockIdx.x; + int local_block_idx = blockIdx.x + 1; + int warp_iter_upper_bound = + (local_block_idx + kOneLoadingCounts * warp_size - 1) / warp_size; + + int local_batches = batch_count - first_idx; + if (local_batches > kLocalBatchSize) local_batches = kLocalBatchSize; + + int local_idx = threadIdx.x; + + src += first_idx * key_seq_len + kOneLoadingCounts * local_idx; + dst += first_idx * key_seq_len + kOneLoadingCounts * local_idx; + + float data[kLocalBatchSize][kLocalIterations]; + T temp_in[kOneLoadingCounts]; + +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { + int batch_total_number = (i >= local_batches) ? 0 : local_block_idx; + +#pragma unroll + for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { + int element_index = kOneLoadingCounts * local_idx + ii * warp_size; + + if (element_index < batch_total_number) { + load_data_upper_tri(temp_in, + src + i * key_seq_len_pow_2 + ii * warp_size); + +#pragma unroll + for (int counter = 0; counter < kOneLoadingCounts; ++counter) { + if ((element_index + counter) < batch_total_number) { + data[i][ii + counter] = static_cast(temp_in[counter]); + } else { + data[i][ii + counter] = -std::numeric_limits::infinity(); + } + } + } else { +#pragma unroll + for (int counter = 0; counter < kOneLoadingCounts; ++counter) { + data[i][ii + counter] = -std::numeric_limits::infinity(); + } + } + } + } + + float max_value[kLocalBatchSize]; +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { + max_value[i] = data[i][0]; +#pragma unroll + for (int ii = 1; ii < kLocalIterations; ++ii) { + max_value[i] = (max_value[i] > data[i][ii]) ? max_value[i] : data[i][ii]; + } + } + warp_reduce_upper_tri( + max_value); + + float sum[kLocalBatchSize]{0.0f}; +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { +#pragma unroll + for (int ii = 0; ii < kLocalIterations; ++ii) { + if (ii < warp_iter_upper_bound) { + data[i][ii] = std::exp((data[i][ii] - max_value[i])); + sum[i] += data[i][ii]; + } + } + } + warp_reduce_upper_tri( + sum); + + T out[kOneLoadingCounts]; +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { + if (i >= local_batches) break; +#pragma unroll + for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { + int element_index = kOneLoadingCounts * local_idx + ii * warp_size; + + if (element_index < local_block_idx) { +#pragma unroll + for (int counter = 0; counter < kOneLoadingCounts; ++counter) { + if (element_index + counter < local_block_idx) { + out[counter] = data[i][ii + counter] / sum[i]; + } else { + out[counter] = 0; + } + } + load_data_upper_tri(dst + i * key_seq_len_pow_2 + ii * warp_size, out); + } else if (element_index < key_seq_len) { + load_zero_vector_upper_tri(dst + i * key_seq_len_pow_2 + + ii * warp_size); + } else { + break; + } + } + } +} + +template +__global__ void SoftmaxMaskFuseUpperTriangleGradGPUKernel(const T* grad_input, + T* grad_output, + const T* softmax_rst, + int batch_count, + int key_seq_len) { + constexpr int next_pow2 = 1 << pow2_index; + constexpr int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; + constexpr int kLocalIterations = std::max(next_pow2 / warp_size, 4); + constexpr int kLocalBatchSize = (next_pow2 <= 128) ? 
2 : 1; + constexpr int kOneLoadingCounts = 4; + int key_seq_len_pow_2 = key_seq_len * key_seq_len; + + int first_idx = + (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * kLocalBatchSize + + blockIdx.x; + int local_block_idx = blockIdx.x + 1; + + // micro_batch_size might not be a multiple of WARP_BATCH. Check how + // many batches have to computed within this WARP. + int local_batches = batch_count - first_idx; + if (local_batches > kLocalBatchSize) local_batches = kLocalBatchSize; + + // there might be multiple batches per warp. compute the index within the + // batch + int local_idx = threadIdx.x; + + // the first element to process by the current thread + int offset = first_idx * key_seq_len + kOneLoadingCounts * local_idx; + grad_input += offset; + grad_output += offset; + softmax_rst += offset; + + // load data from global memory + float grad_input_reg[kLocalBatchSize][kLocalIterations]{0.0f}; + float softmax_rst_reg[kLocalBatchSize][kLocalIterations]{0.0f}; + T temp_grad_input[kOneLoadingCounts]; + T temp_softmax_rst[kOneLoadingCounts]; + +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { + int batch_total_number = (i >= local_batches) ? 0 : local_block_idx; + +#pragma unroll + for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { + int element_index = kOneLoadingCounts * local_idx + ii * warp_size; + if (element_index < batch_total_number) { + load_data_upper_tri( + temp_grad_input, + grad_input + i * key_seq_len_pow_2 + ii * warp_size); + load_data_upper_tri( + temp_softmax_rst, + softmax_rst + i * key_seq_len_pow_2 + ii * warp_size); + +#pragma unroll + for (int counter = 0; counter < kOneLoadingCounts; ++counter) { + if (element_index + counter < batch_total_number) { + softmax_rst_reg[i][ii + counter] = + static_cast(temp_softmax_rst[counter]); + } + } +#pragma unroll + for (int counter = 0; counter < kOneLoadingCounts; ++counter) { + if (element_index + counter < batch_total_number) { + grad_input_reg[i][ii + counter] = + static_cast(temp_grad_input[counter]) * + softmax_rst_reg[i][ii + counter]; + } + } + } + } + } + + float sum[kLocalBatchSize]; +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { + sum[i] = grad_input_reg[i][0]; +#pragma unroll + for (int ii = 1; ii < kLocalIterations; ++ii) { + sum[i] += grad_input_reg[i][ii]; + } + } + warp_reduce_upper_tri( + sum); + +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { + if (i >= local_batches) break; +#pragma unroll + for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { + int element_index = kOneLoadingCounts * local_idx + ii * warp_size; + if (element_index < key_seq_len) { + // compute gradients + T samples_out[kOneLoadingCounts]; +#pragma unroll + for (int counter = 0; counter < kOneLoadingCounts; ++counter) { + samples_out[counter] = grad_input_reg[i][ii + counter] - + softmax_rst_reg[i][ii + counter] * sum[i]; + } + load_data_upper_tri( + grad_output + i * key_seq_len_pow_2 + ii * warp_size, samples_out); + } + } + } +} + +template +class SoftmaxMaskFuseUpperTriangleKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* y = context.Output("Out"); + + auto* x_data = x->data(); + auto* y_data = y->mutable_data(context.GetPlace()); + + auto x_dim = x->dims(); + auto batches = x_dim[0]; + auto attn_heads = x_dim[1]; + auto attn_mul_batch = batches * attn_heads; + auto query_seq_len = x_dim[2]; + auto key_seq_len = x_dim[3]; + + 
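The backward kernel multiplies the incoming gradient by the stored softmax, reduces that product across the row, and subtracts, which is the standard softmax gradient dx_j = y_j * (dy_j - sum_k(y_k * dy_k)). A scalar reference for one row:

#include <cstdio>
#include <vector>

// Reference softmax backward for one row, matching the grad kernel above.
// Masked positions contribute nothing because their softmax output y_k is zero.
std::vector<float> SoftmaxGradRow(const std::vector<float> &y,
                                  const std::vector<float> &dy) {
  float dot = 0.f;
  for (size_t k = 0; k < y.size(); ++k) dot += y[k] * dy[k];
  std::vector<float> dx(y.size());
  for (size_t j = 0; j < y.size(); ++j) dx[j] = y[j] * (dy[j] - dot);
  return dx;
}

int main() {
  std::vector<float> y = {0.6f, 0.4f, 0.0f};   // one masked softmax row
  std::vector<float> dy = {1.0f, -1.0f, 0.0f};
  std::vector<float> dx = SoftmaxGradRow(y, dy);
  std::printf("%g %g %g\n", dx[0], dx[1], dx[2]);  // 0.48 -0.48 0
  return 0;
}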
PADDLE_ENFORCE_EQ(key_seq_len, query_seq_len, + platform::errors::InvalidArgument( + "Key seq len must be equal with query seq len " + "received key len: %d, query len: %d", + key_seq_len, query_seq_len)); + + PADDLE_ENFORCE_EQ(key_seq_len >= 32 && key_seq_len < 8192, true, + platform::errors::InvalidArgument( + "Input x's last dim must be between [32, 8192) " + "received the last dimension of x is %d", + key_seq_len)); + + auto& place = *context.template device_context().eigen_device(); + auto stream = context.cuda_device_context().stream(); + + int pow2_index = get_pow2_index_value(key_seq_len); + const int next_pow2 = 1 << pow2_index; + int batch_count = attn_mul_batch * query_seq_len; + int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; + int batches_per_warp = (next_pow2 <= 128) ? 2 : 1; + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + PADDLE_ENFORCE_EQ( + query_seq_len % batches_per_block, 0, + platform::errors::InvalidArgument( + "The query seq len (third dim of input X) must can divide the " + "number of batches per block. The query seq len is %d, while " + "the number of batches per block is %d.", + query_seq_len, batches_per_block)); + dim3 blocks(query_seq_len, + (attn_mul_batch + batches_per_block) / batches_per_block, 1); + dim3 threads(warp_size, warps_per_block, 1); + + switch (pow2_index) { + case 5: // 32 + SoftmaxMaskFuseUpperTriangleGPUKernel< + T, 5><<>>(x_data, y_data, batch_count, + key_seq_len); + break; + case 6: // 64 + SoftmaxMaskFuseUpperTriangleGPUKernel< + T, 6><<>>(x_data, y_data, batch_count, + key_seq_len); + break; + case 7: // 128 + SoftmaxMaskFuseUpperTriangleGPUKernel< + T, 7><<>>(x_data, y_data, batch_count, + key_seq_len); + break; + case 8: // 256 + SoftmaxMaskFuseUpperTriangleGPUKernel< + T, 8><<>>(x_data, y_data, batch_count, + key_seq_len); + break; + case 9: // 512 + SoftmaxMaskFuseUpperTriangleGPUKernel< + T, 9><<>>(x_data, y_data, batch_count, + key_seq_len); + break; + case 10: // 1024 + SoftmaxMaskFuseUpperTriangleGPUKernel< + T, 10><<>>(x_data, y_data, batch_count, + key_seq_len); + break; + case 11: // 2048 + SoftmaxMaskFuseUpperTriangleGPUKernel< + T, 11><<>>(x_data, y_data, batch_count, + key_seq_len); + break; + case 12: // 4096 + SoftmaxMaskFuseUpperTriangleGPUKernel< + T, 12><<>>(x_data, y_data, batch_count, + key_seq_len); + break; + case 13: // 8192 + SoftmaxMaskFuseUpperTriangleGPUKernel< + T, 13><<>>(x_data, y_data, batch_count, + key_seq_len); + break; + default: + break; + } + } +}; + +template +class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* grad_x = context.Output(framework::GradVarName("X")); + auto* grad_y = context.Input(framework::GradVarName("Out")); + auto* softmax_rst = context.Input("Softmax"); + + auto* grad_x_data = grad_x->mutable_data(context.GetPlace()); + auto* grad_y_data = grad_y->data(); + auto* softmax_rst_data = softmax_rst->data(); + + auto y_dim = grad_y->dims(); + auto batches = y_dim[0]; + auto attn_heads = y_dim[1]; + auto attn_mul_batch = batches * attn_heads; + auto query_seq_len = y_dim[2]; + auto key_seq_len = y_dim[3]; + + auto& place = *context.template device_context().eigen_device(); + auto stream = context.cuda_device_context().stream(); + + int pow2_index = get_pow2_index_value(key_seq_len); + const int next_pow2 = 1 << pow2_index; + int batch_count 
= attn_mul_batch * query_seq_len; + int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; + int batches_per_warp = (next_pow2 <= 128) ? 2 : 1; + // use 128 threads per block to maximum gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + dim3 blocks(query_seq_len, + (attn_mul_batch + batches_per_block) / batches_per_block, 1); + dim3 threads(warp_size, warps_per_block, 1); + + switch (pow2_index) { + case 5: // 32 + SoftmaxMaskFuseUpperTriangleGradGPUKernel< + T, 5><<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); + break; + case 6: // 64 + SoftmaxMaskFuseUpperTriangleGradGPUKernel< + T, 6><<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); + break; + case 7: // 128 + SoftmaxMaskFuseUpperTriangleGradGPUKernel< + T, 7><<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); + break; + case 8: // 256 + SoftmaxMaskFuseUpperTriangleGradGPUKernel< + T, 8><<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); + break; + case 9: // 512 + SoftmaxMaskFuseUpperTriangleGradGPUKernel< + T, 9><<>>(grad_y_data, grad_x_data, + softmax_rst_data, batch_count, + key_seq_len); + break; + case 10: // 1024 + SoftmaxMaskFuseUpperTriangleGradGPUKernel< + T, 10><<>>(grad_y_data, grad_x_data, + softmax_rst_data, + batch_count, key_seq_len); + break; + case 11: // 2048 + SoftmaxMaskFuseUpperTriangleGradGPUKernel< + T, 11><<>>(grad_y_data, grad_x_data, + softmax_rst_data, + batch_count, key_seq_len); + break; + case 12: // 4096 + SoftmaxMaskFuseUpperTriangleGradGPUKernel< + T, 12><<>>(grad_y_data, grad_x_data, + softmax_rst_data, + batch_count, key_seq_len); + break; + case 13: // 8192 + SoftmaxMaskFuseUpperTriangleGradGPUKernel< + T, 13><<>>(grad_y_data, grad_x_data, + softmax_rst_data, + batch_count, key_seq_len); + break; + default: + break; + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL( + fused_softmax_mask_upper_triangle, + ops::SoftmaxMaskFuseUpperTriangleKernel, + ops::SoftmaxMaskFuseUpperTriangleKernel); +REGISTER_OP_CUDA_KERNEL( + fused_softmax_mask_upper_triangle_grad, + ops::SoftmaxMaskFuseUpperTriangleGradKernel, + ops::SoftmaxMaskFuseUpperTriangleGradKernel); diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.h b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.h new file mode 100644 index 0000000000000000000000000000000000000000..61dc571066d2bac4ae137001b0bc203e3e5e210e --- /dev/null +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
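For context on the dispatch above: both the forward and the gradient kernels round key_seq_len up to a power of two, use its exponent as the template parameter, and derive the CUDA launch shape from it. The host-side helper below is a minimal illustrative sketch of that arithmetic, not code from the patch; the constants (WARP_SIZE = 32, 128 threads per block) are taken from the diff, and the rounding behaviour of get_pow2_index_value is an assumption.

#include <algorithm>
#include <cstdio>

// Sketch of the launch-configuration arithmetic used by the fused
// softmax-mask-upper-triangle kernels (illustrative only).
struct LaunchConfig {
  int pow2_index;        // template parameter: 5 -> 32 elements ... 13 -> 8192
  int grid_x, grid_y;    // dim3 blocks(grid_x, grid_y, 1)
  int block_x, block_y;  // dim3 threads(block_x, block_y, 1)
};

LaunchConfig MakeLaunchConfig(int attn_mul_batch, int query_seq_len,
                              int key_seq_len) {
  constexpr int kWarpSize = 32;          // WARP_SIZE in the patch
  constexpr int kThreadsPerBlock = 128;  // fixed block size in the patch

  // Assumed behaviour of get_pow2_index_value: smallest exponent >= 5 whose
  // power of two covers key_seq_len (the patch checks key_seq_len in [32, 8192)).
  int pow2_index = 5;
  while ((1 << pow2_index) < key_seq_len) ++pow2_index;
  const int next_pow2 = 1 << pow2_index;

  const int warp_size = std::min(next_pow2, kWarpSize);
  const int batches_per_warp = (next_pow2 <= 128) ? 2 : 1;
  const int warps_per_block = kThreadsPerBlock / warp_size;
  const int batches_per_block = warps_per_block * batches_per_warp;

  LaunchConfig cfg;
  cfg.pow2_index = pow2_index;
  cfg.grid_x = query_seq_len;
  cfg.grid_y = (attn_mul_batch + batches_per_block) / batches_per_block;
  cfg.block_x = warp_size;
  cfg.block_y = warps_per_block;
  return cfg;
}

int main() {
  // e.g. batch * heads = 192, query len 512, key len 512
  const LaunchConfig cfg = MakeLaunchConfig(192, 512, 512);
  std::printf("pow2_index=%d grid=(%d,%d) block=(%d,%d)\n", cfg.pow2_index,
              cfg.grid_x, cfg.grid_y, cfg.block_x, cfg.block_y);
  return 0;
}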
*/ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class SoftmaxMaskFuseUpperTriangleCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::Unimplemented( + "Softmax mask fuse op only supports GPU now.")); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index 94fe45dac0ce782d6e8f81c737de10b5aefdaaa5..6469307bc5652228e81bd84180f5975b52f4453b 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -30,13 +30,20 @@ using platform::DeviceContext; template __global__ void GatherCUDAKernel(const T* params, const IndexT* indices, - T* output, size_t index_size, - size_t slice_size) { + T* output, size_t input_size, + size_t index_size, size_t slice_size) { CUDA_KERNEL_LOOP(i, index_size * slice_size) { int indices_i = i / slice_size; int slice_i = i - indices_i * slice_size; // offset inside the slice IndexT gather_i = indices[indices_i]; IndexT params_i = gather_i * slice_size + slice_i; + PADDLE_ENFORCE( + gather_i >= 0 && gather_i < input_size, + "The index is out of bounds, " + "please check whether the dimensions of index and " + "input meet the requirements. It should " + "be less than [%d] and greater than or equal to 0, but received [%d]", + input_size, gather_i); *(output + i) = *(params + params_i); } } @@ -58,7 +65,7 @@ __global__ void GatherNdCUDAKernel(const T* input, const int* input_dims, "The index is out of bounds, " "please check whether the dimensions of index and " "input meet the requirements. It should " - "be less than [%d] and greater or equal to 0, but received [%d]", + "be less than [%d] and greater than or equal to 0, but received [%d]", input_dims[j], index_value); gather_i += (index_value * temp); temp *= input_dims[j]; @@ -91,6 +98,7 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, " the second dimension should be 1.")); } + // index size int index_size = index.dims()[0]; auto src_dims = src.dims(); @@ -100,6 +108,8 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, // slice size int slice_size = 1; for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + // input size + int input_size = src_dims[0] * slice_size; const T* p_src = src.data(); const IndexT* p_index = index.data(); @@ -112,7 +122,7 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, GatherCUDAKernel<<< grid, block, 0, reinterpret_cast(ctx).stream()>>>( - p_src, p_index, p_output, index_size, slice_size); + p_src, p_index, p_output, input_size, index_size, slice_size); } template @@ -177,6 +187,15 @@ __global__ void GatherGPUKernel(const T* input, const U* index, T* out, int next_idx = idx - outer_size * inner_dim_index; int index_dim_index = next_idx / outer_dim_size; int index_val = index[index_dim_index]; + + PADDLE_ENFORCE( + index_val >= 0 && index_val < input_index_dim_size, + "The index is out of bounds, " + "please check whether the dimensions of index and " + "input meet the requirements. 
It should " + "be less than [%d] and greater than or equal to 0, but received [%d]", + input_index_dim_size, index_val); + int out_dim_index = next_idx - outer_dim_size * index_dim_index; int input_index = inner_dim_index * (outer_dim_size * input_index_dim_size) + @@ -202,12 +221,11 @@ __global__ void GatherGradGPUKernel(const T* input, const U* index, T* out, } } -template +template void GatherV2CUDAFunction(const Tensor* input, const Tensor* index, - const Tensor* axis, Tensor* out, + const int axis, Tensor* out, const paddle::platform::Place& place, const framework::ExecutionContext& ctx) { - int axis_size = axis->numel(); int index_size = index->numel(); int input_size = input->numel(); auto input_dim = input->dims(); @@ -215,12 +233,8 @@ void GatherV2CUDAFunction(const Tensor* input, const Tensor* index, auto* index_data = index->data(); if (input->numel() == 0) return; - PADDLE_ENFORCE_EQ(axis_size, 1, - platform::errors::InvalidArgument( - "Axis size should be 1, but received %d", axis_size)); - Tensor cpu_axis; - framework::TensorCopy(*axis, platform::CPUPlace(), &cpu_axis); - int axis_index = cpu_axis.data()[0]; + + int axis_index = axis; int index_dim_size = input_dim[axis_index]; int inner_dim_size = 1; @@ -251,26 +265,19 @@ void GatherV2CUDAFunction(const Tensor* input, const Tensor* index, index_size, index_dim_size, out_size); } -template +template void GatherV2GradCUDAFunction(const Tensor* input, const Tensor* index, - const Tensor* axis, Tensor* out, + const int axis, Tensor* out, const paddle::platform::Place& place, const framework::ExecutionContext& ctx) { auto* index_data = index->data(); - - int axis_size = axis->numel(); int index_size = index->numel(); int input_size = input->numel(); auto input_dim = input->dims(); auto* input_data = input->data(); if (input->numel() == 0) return; - PADDLE_ENFORCE_EQ(axis_size, 1, - platform::errors::InvalidArgument( - "Axis size should be 1, but received %d", axis_size)); - Tensor cpu_axis; - framework::TensorCopy(*axis, platform::CPUPlace(), &cpu_axis); - int axis_index = cpu_axis.data()[0]; + int axis_index = axis; int input_index_dim_size = input_dim[axis_index]; int inner_dim_size = 1; diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h index c12a3b8adc97893f523b307a56c0e6b04ea8d675..43dc8240633fd24ab7b193217858fc7b42ebd02f 100644 --- a/paddle/fluid/operators/gather.h +++ b/paddle/fluid/operators/gather.h @@ -67,11 +67,25 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, // slice size int slice_size = 1; for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + // input size + int input_size = src_dims[0] * slice_size; const size_t slice_bytes = slice_size * sizeof(T); for (int64_t i = 0; i < index_size; ++i) { IndexT index_ = p_index[i]; + PADDLE_ENFORCE_LT(p_index[i], input_size, + platform::errors::OutOfRange( + "The element of Index must be less than the size of " + "input dim size of axis which is %d, but received " + "index element which is %d in the %d index.", + input_size, p_index[i], i)); + PADDLE_ENFORCE_GE(p_index[i], 0, + platform::errors::OutOfRange( + "The element of Index must be greater than or equal " + "to 0, but received index element which is %d in the " + "%d index.", + p_index[i], i)); memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); } } @@ -114,7 +128,7 @@ void CPUGatherNd(const platform::DeviceContext& ctx, const Tensor& input, platform::errors::InvalidArgument( "Input(index[-1)] has wrong value, it is [%d]", 
index_value)); PADDLE_ENFORCE_GE( - index_value, 0UL, + index_value, 0, platform::errors::InvalidArgument( "The value of Input(index) must be no less than 0")); @@ -126,33 +140,32 @@ void CPUGatherNd(const platform::DeviceContext& ctx, const Tensor& input, } } -template -void GatherV2Function(const Tensor* input, const Tensor* index, - const Tensor* axis, Tensor* out, - const paddle::platform::Place& place) { - auto* axis_data = axis->data(); +template +void GatherV2Function(const Tensor* input, const Tensor* index, int axis, + Tensor* out, const paddle::platform::Place& place) { auto* index_data = index->data(); - - int axis_size = axis->numel(); int index_size = index->numel(); int input_size = input->numel(); auto input_dim = input->dims(); auto* input_data = input->data(); if (input->numel() == 0) return; - PADDLE_ENFORCE_EQ(axis_size, 1, - platform::errors::InvalidArgument( - "Axis size should be 1, but received %d", axis_size)); - int axis_index = axis_data[0]; + int axis_index = axis; int input_index_dim_size = input_dim[axis_index]; for (int i = 0; i < index_size; i++) { PADDLE_ENFORCE_LT(index_data[i], input_index_dim_size, - platform::errors::InvalidArgument( + platform::errors::OutOfRange( "The element of Index must be less than the size of " "input dim size of axis which is %d, but received " "index element which is %d in the %d index.", input_index_dim_size, index_data[i], i)); + PADDLE_ENFORCE_GE(index_data[i], 0, + platform::errors::OutOfRange( + "The element of Index must be greater than or equal " + "to 0, but received index element which is %d in the " + "%d index.", + index_data[i], i)); } int inner_dim_size = 1; @@ -186,22 +199,17 @@ void GatherV2Function(const Tensor* input, const Tensor* index, } } -template +template void GatherV2GradFunction(const Tensor* input, const Tensor* index, - const Tensor* axis, Tensor* out, + const int axis, Tensor* out, const paddle::platform::Place& place) { - auto* axis_data = axis->data(); auto* index_data = index->data(); - int axis_size = axis->numel(); auto input_dim = input->dims(); auto* input_data = input->data(); if (input->numel() == 0) return; - PADDLE_ENFORCE_EQ(axis_size, 1, - platform::errors::InvalidArgument( - "Axis size should be 1, but received %d", axis_size)); - int axis_index = axis_data[0]; + int axis_index = axis; int input_index_dim_size = input_dim[axis_index]; int inner_dim_size = 1; diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 162766546b3c264ebaf6d833adf9b04c38251f8e..ea28c204ec9cf9e63f1dace5c4a9188b0f1c1719 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace operators { @@ -52,11 +53,29 @@ class GatherOp : public framework::OperatorWithKernel { index_dims.size())); } - int batch_size = ctx->GetInputDim("Index")[0]; - framework::DDim output_dims(ctx->GetInputDim("X")); - output_dims[0] = batch_size; - ctx->SetOutputDim("Out", output_dims); - ctx->ShareLoD("X", /*->*/ "Out"); + auto axis = ctx->Attrs().Get("axis"); + auto input_dim = ctx->GetInputDim("X"); + if (ctx->HasInput("Axis") || axis == 0) { + // if HasInput("Axis"), we can not obtain correct shape of output + int batch_size = index_dims[0]; + framework::DDim output_dims(input_dim); + output_dims[0] = batch_size; + ctx->SetOutputDim("Out", output_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } else { + int index_size = index_dims[0]; + std::vector out_dim_vec; + for (int i = 0; i < axis; i++) { + out_dim_vec.push_back(input_dim[i]); + } + out_dim_vec.push_back(index_size); + for (int i = axis + 1; i < input_dim.size(); i++) { + out_dim_vec.push_back(input_dim[i]); + } + auto output_dims = framework::make_ddim(out_dim_vec); + ctx->SetOutputDim("Out", output_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } } protected: @@ -120,6 +139,10 @@ class GatherOpMaker : public framework::OpProtoAndCheckerMaker { "If true, update the grad using the overwrite mode in same index," "If false, using the accumulate mode in same index.") .SetDefault(true); + AddAttr( + "axis", + "The Tensor which contains the axis that we do gather operation.") + .SetDefault(0); AddComment(R"DOC( Gather Operator. diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 37fbfb21f60a0568390c6798dc305c91fc8af886..6e27d95e01855ce6aa15e51b5a4768509be440f6 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -31,47 +31,33 @@ class GatherOpCUDAKernel : public framework::OpKernel { auto *index = ctx.Input("Index"); auto *output = ctx.Output("Out"); + int axis = ctx.Attr("axis"); + + // get axis from tensor if (ctx.HasInput("Axis")) { - const Tensor *axis = ctx.Input("Axis"); - const auto &index_type = index->type(); - const auto &axis_type = axis->type(); - auto place = ctx.GetPlace(); - if (index_type == framework::proto::VarType::INT32 && - axis_type == framework::proto::VarType::INT32) { - GatherV2CUDAFunction(x, index, axis, output, place, - ctx); - } - if (index_type == framework::proto::VarType::INT32 && - axis_type == framework::proto::VarType::INT64) { - GatherV2CUDAFunction(x, index, axis, output, place, - ctx); + Tensor cpu_axis; + const Tensor *axis_tensor = ctx.Input("Axis"); + framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); + const auto &axis_type = axis_tensor->type(); + if (axis_type == framework::proto::VarType::INT32) { + axis = static_cast(cpu_axis.data()[0]); + } else if (axis_type == framework::proto::VarType::INT64) { + axis = static_cast(cpu_axis.data()[0]); } - if (index_type == framework::proto::VarType::INT64 && - axis_type == framework::proto::VarType::INT32) { - GatherV2CUDAFunction(x, index, axis, output, place, - ctx); - } - if (index_type == framework::proto::VarType::INT64 && - axis_type == framework::proto::VarType::INT64) { - GatherV2CUDAFunction(x, index, axis, output, place, - ctx); + } + const auto &place = ctx.GetPlace(); + const auto &index_type = index->type(); + if (axis != 0) { + if (index_type == framework::proto::VarType::INT32) { + GatherV2CUDAFunction(x, 
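The InferShape change above replaces the old rule (copy X's dims and overwrite dim 0 with the index length) with an axis-aware rule whenever a static axis attribute is given and no runtime Axis input is present. A minimal standalone sketch of that shape rule, with std::vector standing in for framework::DDim:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Output shape of gather along `axis`: X's axis dimension is replaced by the
// number of indices; all other dimensions are kept.
std::vector<int64_t> GatherOutDims(const std::vector<int64_t>& x_dims,
                                   int64_t index_size, int axis) {
  assert(axis >= 0 && axis < static_cast<int>(x_dims.size()));
  std::vector<int64_t> out;
  for (int i = 0; i < axis; ++i) out.push_back(x_dims[i]);
  out.push_back(index_size);
  for (int i = axis + 1; i < static_cast<int>(x_dims.size()); ++i)
    out.push_back(x_dims[i]);
  return out;
}

int main() {
  // X: [4, 5, 6], Index: [3], axis = 1  ->  Out: [4, 3, 6]
  for (int64_t d : GatherOutDims({4, 5, 6}, 3, 1))
    std::printf("%lld ", static_cast<long long>(d));
  std::printf("\n");
  return 0;
}

For axis == 0 this reduces to the previous behaviour, which is why the op keeps the old dim-0 branch when a runtime Axis tensor makes the axis unknown at infer-shape time.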
index, axis, output, place, ctx); + } else if (index_type == framework::proto::VarType::INT64) { + GatherV2CUDAFunction(x, index, axis, output, place, ctx); } return; } + output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); if (index_type == framework::proto::VarType::INT32) { GPUGather(ctx.device_context(), *x, *index, output); } else if (index_type == framework::proto::VarType::INT64) { @@ -91,30 +77,27 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { auto *dX = ctx.Output(framework::GradVarName("X")); auto *dO = ctx.Input(framework::GradVarName("Out")); + int axis = ctx.Attr("axis"); if (ctx.HasInput("Axis")) { - const Tensor *axis = ctx.Input("Axis"); - const auto &index_type = index->type(); - const auto &axis_type = axis->type(); - auto place = ctx.GetPlace(); - if (index_type == framework::proto::VarType::INT32 && - axis_type == framework::proto::VarType::INT32) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - place, ctx); + const Tensor *axis_tensor = ctx.Input("Axis"); + Tensor cpu_axis; + framework::TensorCopy(*axis_tensor, platform::CPUPlace(), &cpu_axis); + const auto &axis_type = axis_tensor->type(); + if (axis_type == framework::proto::VarType::INT32) { + axis = static_cast(cpu_axis.data()[0]); + } else if (axis_type == framework::proto::VarType::INT64) { + axis = static_cast(cpu_axis.data()[0]); } - if (index_type == framework::proto::VarType::INT32 && - axis_type == framework::proto::VarType::INT64) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - place, ctx); - } - if (index_type == framework::proto::VarType::INT64 && - axis_type == framework::proto::VarType::INT32) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - place, ctx); - } - if (index_type == framework::proto::VarType::INT64 && - axis_type == framework::proto::VarType::INT64) { - GatherV2GradCUDAFunction(dO, index, axis, dX, - place, ctx); + } + + const auto &index_type = index->type(); + if (axis != 0) { + if (index_type == framework::proto::VarType::INT32) { + GatherV2GradCUDAFunction(dO, index, axis, dX, + ctx.GetPlace(), ctx); + } else if (index_type == framework::proto::VarType::INT64) { + GatherV2GradCUDAFunction(dO, index, axis, dX, + ctx.GetPlace(), ctx); } return; } @@ -125,19 +108,6 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); if (dO->numel() == 0) return; - - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); if (index_type == framework::proto::VarType::INT32) { 
GPUScatterAssign(ctx, *dO, *index, dX, ctx.Attr("overwrite")); diff --git a/paddle/fluid/operators/gather_op.h b/paddle/fluid/operators/gather_op.h index 8ec0d6ce0b69c791f9bff58f1681f8d4543c57dd..a2570c3e014e11ec10bc98d22607572e2b92d6e5 100644 --- a/paddle/fluid/operators/gather_op.h +++ b/paddle/fluid/operators/gather_op.h @@ -35,45 +35,30 @@ class GatherOpKernel : public framework::OpKernel { auto *index = ctx.Input("Index"); auto *output = ctx.Output("Out"); + int axis = ctx.Attr("axis"); + // get axis from tensor if (ctx.HasInput("Axis")) { - const Tensor *axis = ctx.Input("Axis"); - const auto &index_type = index->type(); - const auto &axis_type = axis->type(); - auto place = ctx.GetPlace(); - if (index_type == framework::proto::VarType::INT32 && - axis_type == framework::proto::VarType::INT32) { - GatherV2Function(x, index, axis, output, place); + const Tensor *axis_tensor = ctx.Input("Axis"); + const auto &axis_type = axis_tensor->type(); + if (axis_type == framework::proto::VarType::INT32) { + axis = static_cast(axis_tensor->data()[0]); + } else if (axis_type == framework::proto::VarType::INT64) { + axis = static_cast(axis_tensor->data()[0]); } - if (index_type == framework::proto::VarType::INT32 && - axis_type == framework::proto::VarType::INT64) { - GatherV2Function(x, index, axis, output, place); - } - if (index_type == framework::proto::VarType::INT64 && - axis_type == framework::proto::VarType::INT32) { - GatherV2Function(x, index, axis, output, place); - } - if (index_type == framework::proto::VarType::INT64 && - axis_type == framework::proto::VarType::INT64) { - GatherV2Function(x, index, axis, output, place); + } + const auto &place = ctx.GetPlace(); + const auto &index_type = index->type(); + if (axis != 0) { + if (index_type == framework::proto::VarType::INT32) { + GatherV2Function(x, index, axis, output, place); + } else if (index_type == framework::proto::VarType::INT64) { + GatherV2Function(x, index, axis, output, place); } return; } output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); if (index_type == framework::proto::VarType::INT32) { CPUGather(ctx.device_context(), *x, *index, output); } else if (index_type == framework::proto::VarType::INT64) { @@ -94,26 +79,23 @@ class GatherGradientOpKernel : public framework::OpKernel { auto *dX = ctx.Output(framework::GradVarName("X")); auto *dO = ctx.Input(framework::GradVarName("Out")); + int axis = ctx.Attr("axis"); if (ctx.HasInput("Axis")) { - const Tensor *axis = ctx.Input("Axis"); - const auto &index_type = index->type(); - const auto &axis_type = axis->type(); - auto place = ctx.GetPlace(); - if (index_type == framework::proto::VarType::INT32 && - axis_type == framework::proto::VarType::INT32) { - GatherV2GradFunction(dO, index, axis, dX, place); + const Tensor *axis_tensor = ctx.Input("Axis"); + const auto &axis_type = axis_tensor->type(); + if (axis_type == framework::proto::VarType::INT32) { + axis = static_cast(axis_tensor->data()[0]); + } else if (axis_type == 
framework::proto::VarType::INT64) { + axis = static_cast(axis_tensor->data()[0]); } - if (index_type == framework::proto::VarType::INT32 && - axis_type == framework::proto::VarType::INT64) { - GatherV2GradFunction(dO, index, axis, dX, place); - } - if (index_type == framework::proto::VarType::INT64 && - axis_type == framework::proto::VarType::INT32) { - GatherV2GradFunction(dO, index, axis, dX, place); - } - if (index_type == framework::proto::VarType::INT64 && - axis_type == framework::proto::VarType::INT64) { - GatherV2GradFunction(dO, index, axis, dX, place); + } + const auto &index_type = index->type(); + + if (axis != 0) { + if (index_type == framework::proto::VarType::INT32) { + GatherV2GradFunction(dO, index, axis, dX, ctx.GetPlace()); + } else if (index_type == framework::proto::VarType::INT64) { + GatherV2GradFunction(dO, index, axis, dX, ctx.GetPlace()); } return; } @@ -126,18 +108,6 @@ class GatherGradientOpKernel : public framework::OpKernel { if (dO->numel() == 0) return; bool overwrite = ctx.Attr("overwrite"); - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Index holds the wrong type, it holds [%s]," - "but desires to be [%s] or [%s].", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); if (index_type == framework::proto::VarType::INT32) { if (overwrite) { ScatterAssign(ctx.device_context(), *dO, *index, dX); diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index 1ee8889995f4d6045f237aa51e00faff7f67b2a3..7c6dd418071ba30e94f9316cb9f9fbd0641e1619 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -33,8 +33,8 @@ class GatherOpNPUKernel : public framework::OpKernel { auto *out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("Gather", {*x, *index}, {*out}, - {{"validate_indices", true}}); + const auto &runner = NpuOpRunner("Gather", {*x, *index}, {*out}, + {{"validate_indices", true}}); auto stream = ctx.template device_context() .stream(); @@ -75,7 +75,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel { zeroslike_xout.numel() * sizeof(T), stream); // step3: scatter(x_grad) - auto runner_scatter = NpuOpRunner( + const auto &runner_scatter = NpuOpRunner( "TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {}); runner_scatter.Run(stream); } diff --git a/paddle/fluid/operators/gather_op_xpu.cc b/paddle/fluid/operators/gather_op_xpu.cc index ae3d0f2633bb18d469b5f755fb81bafab5bab10d..6d1dac830405079feb9333c86b755682dcdba13c 100644 --- a/paddle/fluid/operators/gather_op_xpu.cc +++ b/paddle/fluid/operators/gather_op_xpu.cc @@ -40,16 +40,6 @@ class GatherOpXPUKernel : public framework::OpKernel { output->mutable_data(ctx.GetPlace()); if (x->numel() == 0) return; - // check index type is INT32 - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "XPU only support INT32, it holds %s, but desires to be %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32))); const 
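The same axis-resolution pattern recurs in the forward and grad kernels above (CPU here, CUDA earlier): the integer axis attribute is the default, and if a runtime Axis input exists, its scalar value (int32 or int64; copied to the host first in the CUDA kernels) overrides the attribute before the axis != 0 path is taken. A simplified sketch of that rule, with a plain enum standing in for framework::proto::VarType:

#include <cstdint>
#include <cstdio>

enum class DType { kInt32, kInt64 };

// Resolve the gather axis: attribute by default, runtime scalar if provided.
int ResolveAxis(int axis_attr, bool has_axis_input, DType axis_dtype,
                const void* axis_data) {
  if (!has_axis_input) return axis_attr;
  return axis_dtype == DType::kInt32
             ? static_cast<int>(*static_cast<const int32_t*>(axis_data))
             : static_cast<int>(*static_cast<const int64_t*>(axis_data));
}

int main() {
  int64_t axis_tensor_value = 2;
  std::printf("%d\n", ResolveAxis(0, true, DType::kInt64, &axis_tensor_value));  // 2
  std::printf("%d\n", ResolveAxis(1, false, DType::kInt32, nullptr));            // 1
  return 0;
}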
auto index_dims = index->dims(); if (index_dims.size() == 2) { @@ -65,14 +55,26 @@ class GatherOpXPUKernel : public framework::OpKernel { "The index should be 1D, when it is not 2D, but we get %d", index_dims.size())); } - int slice_size = x->numel() / x->dims()[0]; + std::vector xshape(x->dims().size()); + for (int i = 0; i < x->dims().size(); ++i) { + xshape[i] = x->dims()[i]; + } + auto &dev_ctx = ctx.template device_context(); - int r = - xpu::gather(dev_ctx.x_context(), x->data(), index->data(), - index->dims()[0], slice_size, output->data()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error! error code=%d", r)); + int r = XPU_SUCCESS; + if (index->type() == framework::proto::VarType::INT32) { + r = xpu::gather(dev_ctx.x_context(), x->data(), + index->data(), output->data(), xshape, + index->dims()[0], 0); + } else { + r = xpu::gather(dev_ctx.x_context(), x->data(), + index->data(), output->data(), + xshape, index->dims()[0], 0); + } + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU gather kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } }; @@ -93,30 +95,11 @@ class GatherGradOpXPUKernel : public framework::OpKernel { PADDLE_THROW(platform::errors::InvalidArgument( "Now, it doesn't support XPU with Axis.")); } - - dx->mutable_data(ctx.GetPlace()); - const int zero = 0; - int r_dx = xpu::memset(dev_ctx.x_context(), dx->data(), zero, - dx->numel() * sizeof(T)); - PADDLE_ENFORCE_EQ( - r_dx, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error! error code=%d", r_dx)); - if (dout->numel() == 0) { return; } - bool overwrite = ctx.Attr("overwrite"); - // check index type is INT32 - const auto &index_type = index->type(); - bool index_type_match = index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ( - index_type_match, true, - platform::errors::InvalidArgument( - "XPU only support INT32, it holds %s, but desires to be %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32))); + bool overwrite = ctx.Attr("overwrite"); const auto index_dims = index->dims(); if (index_dims.size() == 2) { PADDLE_ENFORCE_EQ( @@ -131,16 +114,27 @@ class GatherGradOpXPUKernel : public framework::OpKernel { "The index should be 1D, when it is not 2D, but we get %d", index_dims.size())); } + std::vector xshape(dx->dims().size()); + for (int i = 0; i < dx->dims().size(); ++i) { + xshape[i] = dx->dims()[i]; + } - int index_size = index_dims[0]; - int slice_size = dout->numel() / dout->dims()[0]; + dx->mutable_data(ctx.GetPlace()); - int r = xpu::scatter(dev_ctx.x_context(), dout->data(), - index->data(), index_size, slice_size, - dx->data(), overwrite); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU kernel error! 
error code=%d", r)); + int r = XPU_SUCCESS; + if (index->type() == framework::proto::VarType::INT32) { + r = xpu::gather_grad(dev_ctx.x_context(), dout->data(), + index->data(), dx->data(), xshape, + index->dims()[0], 0, overwrite); + } else { + r = xpu::gather_grad(dev_ctx.x_context(), dout->data(), + index->data(), dx->data(), + xshape, index->dims()[0], 0, overwrite); + } + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU gather grad kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } }; diff --git a/paddle/fluid/operators/gaussian_random_op_npu.cc b/paddle/fluid/operators/gaussian_random_op_npu.cc new file mode 100755 index 0000000000000000000000000000000000000000..b5ca26edf8fae44e13cdd91bf1337d6b12c91864 --- /dev/null +++ b/paddle/fluid/operators/gaussian_random_op_npu.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/fill_constant_op.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +class NPUGaussianRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + float mean = context.Attr("mean"); + float std = context.Attr("std"); + auto* tensor = context.Output("Out"); + tensor->mutable_data(context.GetPlace()); + + Tensor cpu_tensor(tensor->type()); + cpu_tensor.Resize(tensor->dims()); + T* cpu_data = cpu_tensor.mutable_data(platform::CPUPlace()); + std::normal_distribution dist(mean, std); + + int64_t size = tensor->numel(); + + unsigned int seed = static_cast(context.Attr("seed")); + auto engine = framework::GetCPURandomEngine(seed); + for (int64_t i = 0; i < size; ++i) { + cpu_data[i] = dist(*engine); + } + framework::TensorCopy( + cpu_tensor, context.GetPlace(), + context.template device_context(), tensor); + context.template device_context() + .Wait(); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL(gaussian_random, ops::NPUGaussianRandomKernel); diff --git a/paddle/fluid/operators/gelu_op_npu.cc b/paddle/fluid/operators/gelu_op_npu.cc index 56aa509177cfd3e5ecfd521e0b66fd72fc708c38..4db82e96cfae7c3a0332f5601b3477780c3d16d1 100644 --- a/paddle/fluid/operators/gelu_op_npu.cc +++ b/paddle/fluid/operators/gelu_op_npu.cc @@ -39,7 +39,7 @@ class GeluNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner = NpuOpRunner("Gelu", {*x}, {*out}, {}); + const auto& runner = NpuOpRunner("Gelu", {*x}, {*out}, {}); runner.Run(stream); } }; @@ -61,13 +61,15 @@ class GeluGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - Tensor out(x->type()); - 
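The new NPU gaussian_random kernel above draws its samples on the host with std::normal_distribution and then copies the buffer into the device tensor. A minimal host-side sketch of the sampling step; std::mt19937_64 stands in for framework::GetCPURandomEngine, whose seeding behaviour is Paddle-specific:

#include <cstdint>
#include <cstdio>
#include <random>
#include <vector>

std::vector<float> SampleGaussian(int64_t size, float mean, float std_dev,
                                  uint64_t seed) {
  // seed == 0 is treated as "non-deterministic" in this sketch.
  std::mt19937_64 engine(seed ? seed : std::random_device{}());
  std::normal_distribution<float> dist(mean, std_dev);
  std::vector<float> data(static_cast<size_t>(size));
  for (float& v : data) v = dist(engine);
  return data;
}

int main() {
  for (float v : SampleGaussian(4, 0.f, 1.f, 42)) std::printf("%f ", v);
  std::printf("\n");
  return 0;
}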
out.mutable_data(x->dims(), place); - auto out_runner = NpuOpRunner("Gelu", {*x}, {out}, {}); - out_runner.Run(stream); - - auto dx_runner = NpuOpRunner("GeluGrad", {*dout, *x, out}, {*dx}, {}); - dx_runner.Run(stream); + // NOTE(pangyoki): In the original implementation of GeluGrad op, the input + // is {*dout, *x, out}, where out = Gelu(x). However, we find that variable + // `out` was not actually used. In order to improve performance, the + // useless GELU operation was deleted. + // We directly use `*dout` as a placeholder to replace `out`, it will not + // be used in calculations. + const auto& runner_dx = + NpuOpRunner("GeluGrad", {*dout, *x, *dout}, {*dx}, {}); + runner_dx.Run(stream); } }; diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu index e9b0a0108afc2336aa3bf350173ea4fa38635593..762d14096a5ab4d094894ad7c0ec822f5cc25d3b 100644 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -187,7 +187,6 @@ __global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c, int out_sC = out_h * out_w; int out_sH = out_w; int out_sW = 1; - CUDA_KERNEL_LOOP(index, nthreads) { const int w = index % out_w; const int h = (index / out_w) % out_h; @@ -199,7 +198,6 @@ __global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c, ix = compute_positions(ix, in_w, padding_mode, align_corners); iy = compute_positions(iy, in_h, padding_mode, align_corners); - if (mode == Mode::bilinear) { int ix_nw = static_cast(floor(ix)); int iy_nw = static_cast(floor(iy)); @@ -216,6 +214,7 @@ __global__ void grid_sample_cuda_kernel(const int nthreads, int n, int out_c, T se = (ix - ix_nw) * (iy - iy_nw); auto inp_offset_NC = n * inp_sN; + auto out_ptr_NCHW = output + n * out_sN + h * out_sH + w * out_sW; for (int c = 0; c < out_c; ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { @@ -291,17 +290,17 @@ class GridSampleOpCUDAKernel : public framework::OpKernel { << "; out_w: " << out_w; auto* output = ctx.Output("Output"); auto* output_data = output->mutable_data(ctx.GetPlace()); - - VLOG(3) << "set constant"; + VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1] + << "; " << output->dims()[2] << "; " << output->dims()[3]; math::SetConstant()( dev_ctx, output, static_cast(0)); int count = static_cast(n * out_h * out_w); - auto cu_stream = dev_ctx.stream(); - - int block = 512; - int grid_size = (count + block - 1) / block; - grid_sample_cuda_kernel<<>>( + int block_size = 512; + int grid_size = (count + block_size - 1) / block_size; + VLOG(3) << "cuda launch - grid dims: " << grid_size << "; block dims" + << block_size; + grid_sample_cuda_kernel<<>>( count, n, c, out_h, out_w, in_h, in_w, input->data(), grid->data(), output_data, mode, padding_mode, align_corners); } @@ -475,9 +474,12 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel { int count = static_cast(n * out_h * out_w); auto cu_stream = dev_ctx.stream(); - int block = 512; - int grid_size = (count + block - 1) / block; - grid_sampler_cuda_backward_kernel<<>>( + int block_size = 512; + int grid_size = (count + block_size - 1) / block_size; + VLOG(3) << "cuda launch grad kernel - grid dims: " << grid_size + << "; block dims" << block_size << "; count: " << count; + grid_sampler_cuda_backward_kernel< + T><<>>( count, output_grad->data(), input->data(), grid->data(), n, c, out_h, out_w, in_h, in_w, input_grad->data(), grid_grad_data, mode, padding_mode, align_corners); diff --git 
a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h index afe70ea64a99977737333168ab7ccff154d57668..2f0edd0451a3b76aa25a38de5febbabd70cf838d 100644 --- a/paddle/fluid/operators/group_norm_op.h +++ b/paddle/fluid/operators/group_norm_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once #include +#include +#include #include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" @@ -73,6 +75,11 @@ class GroupNormKernel : public framework::OpKernel { auto* iter_y_data = y_data; for (int bid = 0; bid < x_dims[0]; bid++) { for (int gid = 0; gid < groups; gid++) { + const int64_t M = 8; + std::array x_mean_arr; + std::array x_var_arr; + std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0)); + std::fill(x_var_arr.begin(), x_var_arr.end(), T(0)); T x_mean = 0, x_var = 0; int number = std::min(group_size, static_cast(C - gid * group_size)); @@ -83,7 +90,37 @@ class GroupNormKernel : public framework::OpKernel { if (data_layout == DataLayout::kNCHW) { for (int cid = 0; cid < number; cid++) { - for (int imid = 0; imid < imsize; imid++, iter_x_data++) { + int imid; + for (imid = 0; imid < imsize - (imsize % M); + imid += M, iter_x_data += M) { + // TODO(gaoxiang) :Because AVX/AVX2/AVX512 can not directly used + // in template class/function, before we complete high + // performance cpu vector extension, temporarily unrolling + // loop to get high precision and performance + x_mean_arr[0] += iter_x_data[0]; + x_var_arr[0] += iter_x_data[0] * iter_x_data[0]; + x_mean_arr[1] += iter_x_data[1]; + x_var_arr[1] += iter_x_data[1] * iter_x_data[1]; + x_mean_arr[2] += iter_x_data[2]; + x_var_arr[2] += iter_x_data[2] * iter_x_data[2]; + x_mean_arr[3] += iter_x_data[3]; + x_var_arr[3] += iter_x_data[3] * iter_x_data[3]; + x_mean_arr[4] += iter_x_data[4]; + x_var_arr[4] += iter_x_data[4] * iter_x_data[4]; + x_mean_arr[5] += iter_x_data[5]; + x_var_arr[5] += iter_x_data[5] * iter_x_data[5]; + x_mean_arr[6] += iter_x_data[6]; + x_var_arr[6] += iter_x_data[6] * iter_x_data[6]; + x_mean_arr[7] += iter_x_data[7]; + x_var_arr[7] += iter_x_data[7] * iter_x_data[7]; + } + x_mean = + std::accumulate(x_mean_arr.cbegin(), x_mean_arr.cend(), x_mean); + x_var = + std::accumulate(x_var_arr.cbegin(), x_var_arr.cend(), x_var); + std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0)); + std::fill(x_var_arr.begin(), x_var_arr.end(), T(0)); + for (; imid < imsize; imid++, iter_x_data++) { x_mean += iter_x_data[0]; x_var += iter_x_data[0] * iter_x_data[0]; } @@ -91,7 +128,37 @@ class GroupNormKernel : public framework::OpKernel { } else { for (int cid = 0; cid < number; cid++) { iter_x_data = tmp_x + cid; - for (int imid = 0; imid < imsize; imid++, iter_x_data += C) { + int imid; + for (imid = 0; imid < imsize - (imsize % M); + imid += M, iter_x_data += M * C) { + // TODO(gaoxiang) :Because AVX/AVX2/AVX512 can not directly used + // in template class/function, before we complete high + // performance cpu vector extension, temporarily unrolling + // loop to get high precision and performance + x_mean_arr[0] += iter_x_data[0 * C]; + x_var_arr[0] += iter_x_data[0 * C] * iter_x_data[0 * C]; + x_mean_arr[1] += iter_x_data[1 * C]; + x_var_arr[1] += iter_x_data[1 * C] * iter_x_data[1 * C]; + x_mean_arr[2] += iter_x_data[2 * C]; + x_var_arr[2] += iter_x_data[2 * C] * iter_x_data[2 * C]; + x_mean_arr[3] += iter_x_data[3 * C]; + x_var_arr[3] += iter_x_data[3 * C] * iter_x_data[3 * C]; + x_mean_arr[4] += iter_x_data[4 * C]; + x_var_arr[4] += iter_x_data[4 * C] * 
iter_x_data[4 * C]; + x_mean_arr[5] += iter_x_data[5 * C]; + x_var_arr[5] += iter_x_data[5 * C] * iter_x_data[5 * C]; + x_mean_arr[6] += iter_x_data[6 * C]; + x_var_arr[6] += iter_x_data[6 * C] * iter_x_data[6 * C]; + x_mean_arr[7] += iter_x_data[7 * C]; + x_var_arr[7] += iter_x_data[7 * C] * iter_x_data[7 * C]; + } + x_mean = + std::accumulate(x_mean_arr.cbegin(), x_mean_arr.cend(), x_mean); + x_var = + std::accumulate(x_var_arr.cbegin(), x_var_arr.cend(), x_var); + std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0)); + std::fill(x_var_arr.begin(), x_var_arr.end(), T(0)); + for (; imid < imsize; imid++, iter_x_data += C) { x_mean += iter_x_data[0]; x_var += iter_x_data[0] * iter_x_data[0]; } @@ -101,8 +168,8 @@ class GroupNormKernel : public framework::OpKernel { x_mean /= number * imsize; x_var /= number * imsize; - x_var = x_var - x_mean * x_mean; - T var_inv = 1.0 / sqrt(x_var + epsilon); + x_var = std::max(x_var - x_mean * x_mean, T(0)); + T var_inv = T(1) / std::sqrt(x_var + epsilon); mean_data[bid * groups + gid] = x_mean; var_data[bid * groups + gid] = x_var; diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index e60b1538eee64e9eae7bdae8b7b1d6117c80d229..cce80518354d75b9caa61462a2d3cefb3fa47627 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -143,3 +143,10 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( hinge_loss_grad, ops::HingeLossGradKernel); + +REGISTER_OP_CUDA_KERNEL( + hinge_loss, + ops::HingeLossKernel<paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL( + hinge_loss_grad, + ops::HingeLossGradKernel<paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/hinge_loss_op.h b/paddle/fluid/operators/hinge_loss_op.h index 10c17a0982fd7995056aeb1f70648fd78b3d9c05..c78eddd2528117035085d7ada63bfde5798562dc 100644 --- a/paddle/fluid/operators/hinge_loss_op.h +++ b/paddle/fluid/operators/hinge_loss_op.h @@ -15,6 +15,7 @@ limitations under the License.
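The GroupNorm change above accumulates each group's sum and sum of squares into eight partial accumulators (hand-unrolled in the patch because, per its TODO, AVX intrinsics cannot be used directly in the templated kernel) and clamps the variance at zero before the reciprocal square root. A compact sketch of the same computation over a flat value array:

#include <algorithm>
#include <array>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

template <typename T>
void BlockedMeanVar(const std::vector<T>& x, T epsilon, T* mean, T* var_inv) {
  constexpr int M = 8;  // number of partial accumulators, as in the patch
  std::array<T, M> sum{}, sq_sum{};
  size_t i = 0;
  for (; i + M <= x.size(); i += M) {
    for (int k = 0; k < M; ++k) {  // the patch unrolls this inner loop by hand
      sum[k] += x[i + k];
      sq_sum[k] += x[i + k] * x[i + k];
    }
  }
  T x_mean = std::accumulate(sum.cbegin(), sum.cend(), T(0));
  T x_var = std::accumulate(sq_sum.cbegin(), sq_sum.cend(), T(0));
  for (; i < x.size(); ++i) {  // tail elements
    x_mean += x[i];
    x_var += x[i] * x[i];
  }
  x_mean /= static_cast<T>(x.size());
  x_var /= static_cast<T>(x.size());
  // E[x^2] - E[x]^2 can go slightly negative through round-off; clamp it.
  x_var = std::max(x_var - x_mean * x_mean, T(0));
  *mean = x_mean;
  *var_inv = T(1) / std::sqrt(x_var + epsilon);
}

int main() {
  std::vector<float> x(100, 1.0f);
  float mean = 0.f, var_inv = 0.f;
  BlockedMeanVar(x, 1e-5f, &mean, &var_inv);
  std::printf("mean=%f var_inv=%f\n", mean, var_inv);
  return 0;
}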
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -33,9 +34,7 @@ class HingeLossKernel : public framework::OpKernel { auto y = framework::EigenVector::Flatten(*label); loss->mutable_data(context.GetPlace()); auto l = framework::EigenVector::Flatten(*loss); - l.device(place) = - (static_cast(1) - x * (static_cast(2) * y - static_cast(1))) - .cwiseMax(static_cast(0)); + EigenHingeLoss, T>::Eval(place, l, x, y); } }; @@ -59,10 +58,8 @@ class HingeLossGradKernel : public framework::OpKernel { if (dpred) { dpred->mutable_data(context.GetPlace()); auto dx = framework::EigenVector::Flatten(*dpred); - auto alt_labels = static_cast(2) * y - static_cast(1); - dx.device(place) = - dl * ((x * alt_labels) < static_cast(1)).template cast() * - (-alt_labels); + EigenHingeLossGrad, T>::Eval(place, dx, dl, + x, y); } } }; diff --git a/paddle/fluid/operators/histogram_op.cu b/paddle/fluid/operators/histogram_op.cu index 5f86f8d72c079dd554482685403a74d14934336e..6a9183a8b465b7526f956b84b23b3d2be6c0f141 100644 --- a/paddle/fluid/operators/histogram_op.cu +++ b/paddle/fluid/operators/histogram_op.cu @@ -81,6 +81,13 @@ class HistogramCUDAKernel : public framework::OpKernel { const T* input_data = input->data(); const int input_numel = input->numel(); + int64_t* out_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, + static_cast(0)); + + if (input_data == nullptr) return; + T output_min = static_cast(minval); T output_max = static_cast(maxval); @@ -126,11 +133,6 @@ class HistogramCUDAKernel : public framework::OpKernel { "But received max is %d, min is %d", maxval, minval)); - int64_t* out_data = output->mutable_data(context.GetPlace()); - math::SetConstant()( - context.template device_context(), output, - static_cast(0)); - auto stream = context.template device_context().stream(); KernelHistogram< diff --git a/paddle/fluid/operators/histogram_op.h b/paddle/fluid/operators/histogram_op.h index 6e48c86d022bda78c5f24a53679b6437c38f0e92..a6f4448cbcb17e7b596514a967da9c7c748c69a6 100644 --- a/paddle/fluid/operators/histogram_op.h +++ b/paddle/fluid/operators/histogram_op.h @@ -38,6 +38,13 @@ class HistogramKernel : public framework::OpKernel { const T* input_data = input->data(); auto input_numel = input->numel(); + int64_t* out_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, + static_cast(0)); + + if (input_data == nullptr) return; + T output_min = static_cast(minval); T output_max = static_cast(maxval); if (output_min == output_max) { @@ -63,11 +70,6 @@ class HistogramKernel : public framework::OpKernel { "But received max is %d, min is %d", maxval, minval)); - int64_t* out_data = output->mutable_data(context.GetPlace()); - math::SetConstant()( - context.template device_context(), output, - static_cast(0)); - for (int64_t i = 0; i < input_numel; i++) { if (input_data[i] >= output_min && input_data[i] <= output_max) { const int64_t bin = (int64_t)((input_data[i] - output_min) * nbins / diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index b973d5d9d8fe16ffb0faab83576bd5f71a16474c..d248857b8f42fb9e8a6c8a0ac60546a390597714 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -192,3 +192,10 @@ REGISTER_OP_CPU_KERNEL( 
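The histogram change above moves the output allocation and zero-fill ahead of the empty-input early return, so an empty input now yields an all-zero histogram instead of skipping initialization. A small sketch of the resulting control flow and of the binning rule; clamping the maximum value into the last bin is an assumption of this sketch rather than a detail copied from the kernel:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

std::vector<int64_t> Histogram(const std::vector<float>& input, int nbins,
                               float output_min, float output_max) {
  std::vector<int64_t> out(nbins, 0);  // zero-fill first ...
  if (input.empty()) return out;       // ... so an empty input returns all zeros
  for (float v : input) {
    if (v >= output_min && v <= output_max) {
      int64_t bin = static_cast<int64_t>((v - output_min) * nbins /
                                         (output_max - output_min));
      out[std::min<int64_t>(bin, nbins - 1)] += 1;  // clamp max value (assumption)
    }
  }
  return out;
}

int main() {
  const auto h = Histogram({0.1f, 0.5f, 0.9f, 1.0f}, 2, 0.0f, 1.0f);
  std::printf("%lld %lld\n", static_cast<long long>(h[0]),
              static_cast<long long>(h[1]));
  return 0;
}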
REGISTER_OP_CPU_KERNEL( im2sequence_grad, ops::Im2SequenceGradKernel); + +REGISTER_OP_CUDA_KERNEL( + im2sequence, + ops::Im2SequenceKernel); +REGISTER_OP_CUDA_KERNEL( + im2sequence_grad, + ops::Im2SequenceGradKernel); diff --git a/paddle/fluid/operators/im2sequence_op.cu b/paddle/fluid/operators/im2sequence_op.cu deleted file mode 100644 index 1c34640618d58d3b5fe627fa6596260a7b687d05..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/im2sequence_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ -#include "paddle/fluid/operators/im2sequence_op.h" - -namespace ops = paddle::operators; - -REGISTER_OP_CUDA_KERNEL( - im2sequence, - ops::Im2SequenceKernel); -REGISTER_OP_CUDA_KERNEL( - im2sequence_grad, - ops::Im2SequenceGradKernel); diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index 9c9069b722763d0ec0d39d2f6fb35477c7578f30..760d6a63de13ac72a578e565c1bea8fc58130eb9 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/operators/math/math_function.h" @@ -157,7 +158,7 @@ class Im2SequenceGradKernel : public framework::OpKernel { auto x_v = framework::EigenVector::Flatten(*d_x); auto& place = *ctx.template device_context().eigen_device(); - x_v.device(place) = x_v.constant(0.0); + EigenConstant, T, 1>::Eval(place, x_v, 0.0); auto in_dim = in->dims(); int batch_size = in_dim[0]; diff --git a/paddle/fluid/operators/imag_op.cc b/paddle/fluid/operators/imag_op.cc index 899025ae7093b45833805687c9d499e2d1fa02e7..6a195bb9400e89ef09bc7ca2c08637eeb505dda2 100644 --- a/paddle/fluid/operators/imag_op.cc +++ b/paddle/fluid/operators/imag_op.cc @@ -96,11 +96,11 @@ REGISTER_OPERATOR(imag, ops::ImagOp, ops::ImagOpMaker, REGISTER_OPERATOR(imag_grad, ops::ImagGradOp); REGISTER_OP_CPU_KERNEL(imag, ops::ImagKernel, + paddle::platform::complex>, ops::ImagKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL(imag_grad, ops::ImagGradKernel, + paddle::platform::complex>, ops::ImagGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/imag_op.cu b/paddle/fluid/operators/imag_op.cu index a7a3b1368219891dc5d98e25f4c38be5ad216baf..9cfb2ef7f2fef6b25322ba76bedadae3c6ca8d87 100644 --- a/paddle/fluid/operators/imag_op.cu +++ b/paddle/fluid/operators/imag_op.cu @@ -18,11 +18,11 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(imag, ops::ImagKernel, + paddle::platform::complex>, ops::ImagKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL(imag_grad, ops::ImagGradKernel, + paddle::platform::complex>, ops::ImagGradKernel); + paddle::platform::complex>); diff --git 
a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index e8edfb99f9f306d7057afcdf935cad5a5e4a73d6..e727f6ceb56f7e53d5828dad5bde8d11f05df379 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -107,3 +107,9 @@ REGISTER_OP_CPU_KERNEL( ops::IncrementKernel, ops::IncrementKernel, ops::IncrementKernel); + +REGISTER_OP_CUDA_KERNEL( + increment, ops::IncrementKernel, + ops::IncrementKernel, + ops::IncrementKernel, + ops::IncrementKernel); diff --git a/paddle/fluid/operators/increment_op.h b/paddle/fluid/operators/increment_op.h index d0e8c66255ef68b975701fb6b3c145be2590e271..4b9d07146484ff00ba105b9971f40f91dd8148de 100644 --- a/paddle/fluid/operators/increment_op.h +++ b/paddle/fluid/operators/increment_op.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -30,8 +31,9 @@ class IncrementKernel : public framework::OpKernel { out_tensor->mutable_data(context.GetPlace()); auto& dev = *context.template device_context().eigen_device(); - framework::EigenScalar::From(*out_tensor).device(dev) = - framework::EigenScalar::From(*x_tensor) + static_cast(step); + EigenAdd, T>::Eval( + dev, framework::EigenScalar::From(*out_tensor), + framework::EigenScalar::From(*x_tensor), static_cast(step)); } }; diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc index 7d75e385e8f3b7c88c393c7195b49e17397f08aa..35ebe92b364d3cf241c3778687b0d4123700c56b 100644 --- a/paddle/fluid/operators/increment_op_npu.cc +++ b/paddle/fluid/operators/increment_op_npu.cc @@ -43,7 +43,7 @@ class IncrementalNPUKernel : public framework::OpKernel { step_tensor.mutable_data({1}, context.GetPlace()); FillNpuTensorWithConstant(&step_tensor, static_cast(step)); - auto runner = + const auto& runner = NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {}); auto stream = diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 6c488c387f81500bf12b9a7cc8102944ffb301c4..445d129d07c14b8300a04ac311501f96c96c2175 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -88,8 +88,11 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { platform::errors::InvalidArgument( "OutSize's dimension size must be 1, but got dimention = %d .", out_size_dim.size())); - PADDLE_ENFORCE_EQ(out_size_dim[0], 1, platform::errors::InvalidArgument( - "OutSize's dim[0] must be 1")); + PADDLE_ENFORCE_EQ( + out_size_dim[0], 1, + platform::errors::InvalidArgument( + "OutSize's 0-th dimension's value must be 1, but got value = %d .", + out_size_dim[0])); ctx->ShareLoD("X", "Out"); return; } diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index cb93044ca58445dcb4817629ef859e312f900983..97e39e71a556971fb16e3f2abce7a3bf93f17137 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -35,7 +35,12 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { interp_method)); const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); - + for (int i = 0; i < dim_x.size(); ++i) { + PADDLE_ENFORCE_NE(dim_x[i], 0, platform::errors::InvalidArgument( + "The shape of input(x) should be larged " + "than 0, bug 
received shape[%d] is %d ", + i, dim_x[i])); + } if (ctx->HasInputs("SizeTensor")) { // top prority size auto inputs_name = ctx->Inputs("SizeTensor"); @@ -76,9 +81,12 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { if (scale.size() > 0) { float scale_w = -1; scale_w = scale[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); if (scale_w > 0.) { // round down out_w = (data_layout == DataLayout::kNCHW @@ -99,8 +107,11 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { platform::errors::InvalidArgument( "OutSize's dimension size must be 1, but got dimention = %d .", out_size_dim.size())); - PADDLE_ENFORCE_EQ(out_size_dim[0], 1, platform::errors::InvalidArgument( - "OutSize's dim[0] must be 1")); + PADDLE_ENFORCE_EQ( + out_size_dim[0], 1, + platform::errors::InvalidArgument( + "OutSize's 0-th dimension's value must be 1, but got value = %d .", + out_size_dim[0])); ctx->ShareLoD("X", "Out"); return; } @@ -128,6 +139,13 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); + for (int i = 0; i < dim_x.size(); ++i) { + PADDLE_ENFORCE_NE(dim_x[i], 0, platform::errors::InvalidArgument( + "The shape of input(x) should be larger " + "than 0, but received shape[%d] is %d ", + i, dim_x[i])); + } + if (ctx->HasInputs("SizeTensor")) { // top prority size auto inputs_name = ctx->Inputs("SizeTensor"); @@ -173,9 +191,17 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { scale_h = scale[0]; scale_w = scale[1]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); if (scale_h > 0. && scale_w > 0.)
{ // round down out_h = (data_layout == DataLayout::kNCHW @@ -232,6 +258,13 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { const DataLayout data_layout = framework::StringToDataLayout( ctx->Attrs().Get("data_layout")); + for (int i = 0; i < dim_x.size(); ++i) { + PADDLE_ENFORCE_NE(dim_x[i], 0, platform::errors::InvalidArgument( + "The shape of input(x) should be larged " + "than 0, bug received shape[%d] is %d ", + i, dim_x[i])); + } + if (ctx->HasInputs("SizeTensor")) { // top prority size auto inputs_name = ctx->Inputs("SizeTensor"); @@ -281,9 +314,23 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { scale_h = scale[1]; scale_w = scale[2]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { // round down out_d = (data_layout == DataLayout::kNCHW diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index e5002e72d0edd7854bdbcc57713c20b5fec28eaf..6745592c5c1a8bb951059c55901e691ed274601e 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -982,15 +982,21 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); } else { if (scale.size() > 0) { scale_w = scale[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); } } if (scale_w > 0.) 
{ @@ -1081,18 +1087,36 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, scale_h = scale_data[0]; scale_w = scale_data[0]; } + PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); } else { if (scale.size() > 1) { scale_w = scale[1]; scale_h = scale[0]; + PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); } } if (scale_w > 0. && scale_h > 0.) { @@ -1216,10 +1240,25 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, scale_h = scale_data[0]; scale_w = scale_data[0]; } + + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); } else { if (scale.size() > 1) { scale_d = scale[0]; @@ -1227,9 +1266,23 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, scale_w = scale[2]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); } } if (scale_d > 0. && scale_h > 0. && scale_w > 0.) 
{ @@ -1334,16 +1387,22 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); } else { if (scale.size() > 0) { scale_w = scale[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); } } if (scale_w > 0.) { @@ -1433,19 +1492,36 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, scale_h = scale_data[0]; scale_w = scale_data[0]; } + + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); } else { if (scale.size() > 1) { scale_w = scale[1]; scale_h = scale[0]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); } } if (scale_w > 0. && scale_h > 0.) 
{ @@ -1581,9 +1657,23 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, scale_w = scale_data[0]; } PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); } else { if (scale.size() > 1) { scale_d = scale[0]; @@ -1591,9 +1681,23 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, scale_w = scale[2]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); } } if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index dab9948edc3592e8c1635c5bb62b7dfbd09dd1e1..308330313a976997df9547abc9db6ec091718543 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -18,8 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -185,9 +184,9 @@ REGISTER_OP_CPU_KERNEL( ops::KronKernel, ops::KronKernel, ops::KronKernel, + paddle::platform::complex>, ops::KronKernel); + paddle::platform::complex>); REGISTER_OPERATOR(kron_grad, ops::KronGradOp); REGISTER_OP_CPU_KERNEL( @@ -198,6 +197,6 @@ REGISTER_OP_CPU_KERNEL( ops::KronGradKernel, ops::KronGradKernel, ops::KronGradKernel, + paddle::platform::complex>, ops::KronGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/kron_op.cu b/paddle/fluid/operators/kron_op.cu index a348cb2e1759e8ad8c2f70c7c25478c94e35e786..e5124e65007509568ae8cd8ab65b33c504a12fe9 100644 --- a/paddle/fluid/operators/kron_op.cu +++ b/paddle/fluid/operators/kron_op.cu @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/kron_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -26,9 +25,9 @@ REGISTER_OP_CUDA_KERNEL( ops::KronKernel, ops::KronKernel, ops::KronKernel, + paddle::platform::complex>, ops::KronKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( kron_grad, ops::KronGradKernel, @@ -38,6 +37,6 @@ REGISTER_OP_CUDA_KERNEL( ops::KronGradKernel, ops::KronGradKernel, ops::KronGradKernel, + paddle::platform::complex>, ops::KronGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/kron_op.h b/paddle/fluid/operators/kron_op.h index 6815fd460fa1f1969c9bf01f733f30b941fd8799..ea2050fe8e61e7d36c40760e66eb6b3def8d3246 100644 --- a/paddle/fluid/operators/kron_op.h +++ b/paddle/fluid/operators/kron_op.h @@ -26,9 +26,6 @@ limitations under the License. */ namespace paddle { namespace operators { -using complex64 = paddle::platform::complex64; -using complex128 = paddle::platform::complex128; - // Process an element in the output, used with a parallel-for template struct KronElemFunctor { @@ -175,72 +172,13 @@ struct KronGradElemFunctor { const int ndims_; }; -template <> -struct KronGradElemFunctor { - KronGradElemFunctor(const complex64* dout, const complex64* A, - const complex64* B, complex64* dout_a, complex64* dout_b, - const int64_t* stride_dout, const int64_t* stride_a, - const int64_t* stride_b, const int64_t* shape_b, - const int64_t numel_a, const int64_t numel_b, - const int ndims) - : dout_(dout), - A_(A), - B_(B), - dout_a_(dout_a), - dout_b_(dout_b), - stride_dout_(stride_dout), - stride_a_(stride_a), - stride_b_(stride_b), - shape_b_(shape_b), - numel_a_(numel_a), - numel_b_(numel_b), - ndims_(ndims) {} - - HOSTDEVICE void operator()(int64_t idx) { - int64_t index = idx; - int64_t index_a = 0; - int64_t index_b = 0; - for (int i = 0; i < ndims_; i++) { - auto pos_i = index / stride_dout_[i]; - index = index % stride_dout_[i]; - auto pos_ai = pos_i / shape_b_[i]; - auto pos_bi = pos_i % shape_b_[i]; - index_a += stride_a_[i] * pos_ai; - index_b += stride_b_[i] * pos_bi; - } - - if (dout_a_) { - size_t index_out_a = index_a * numel_b_ + index_b; - dout_a_[index_out_a] = - dout_[idx] * complex64(B_[index_b].real, -B_[index_b].imag); - } - if (dout_b_) { - size_t index_out_b = index_b * numel_a_ + index_a; - dout_b_[index_out_b] = - dout_[idx] * complex64(A_[index_a].real, -A_[index_a].imag); - } - } - - private: - const complex64* dout_; - const complex64* A_; - const complex64* B_; - complex64* dout_a_; - complex64* dout_b_; - const int64_t* stride_dout_; - const int64_t* stride_a_; - const int64_t* stride_b_; - const int64_t* shape_b_; - const int64_t numel_a_; - const int64_t numel_b_; - const int ndims_; -}; - -template <> -struct KronGradElemFunctor { - KronGradElemFunctor(const complex128* dout, const complex128* A, - const complex128* B, complex128* dout_a, - complex128* dout_b, const int64_t* stride_dout, +template +struct KronGradElemFunctor> { + KronGradElemFunctor(const platform::complex* dout, + const platform::complex* A, + const platform::complex* B, + platform::complex* dout_a, + platform::complex* dout_b, const int64_t* stride_dout, const int64_t* stride_a, const int64_t* stride_b, const int64_t* shape_b, const int64_t numel_a, const int64_t numel_b, const int ndims) @@ -273,21 +211,23 @@ struct KronGradElemFunctor { if (dout_a_) { size_t 
index_out_a = index_a * numel_b_ + index_b; dout_a_[index_out_a] = - dout_[idx] * complex128(B_[index_b].real, -B_[index_b].imag); + dout_[idx] * + platform::complex(B_[index_b].real, -B_[index_b].imag); } if (dout_b_) { size_t index_out_b = index_b * numel_a_ + index_a; dout_b_[index_out_b] = - dout_[idx] * complex128(A_[index_a].real, -A_[index_a].imag); + dout_[idx] * + platform::complex(A_[index_a].real, -A_[index_a].imag); } } private: - const complex128* dout_; - const complex128* A_; - const complex128* B_; - complex128* dout_a_; - complex128* dout_b_; + const platform::complex* dout_; + const platform::complex* A_; + const platform::complex* B_; + platform::complex* dout_a_; + platform::complex* dout_b_; const int64_t* stride_dout_; const int64_t* stride_a_; const int64_t* stride_b_; @@ -297,11 +237,13 @@ struct KronGradElemFunctor { const int ndims_; }; -template struct IdentityFunctor { HOSTDEVICE explicit inline IdentityFunctor() {} - HOSTDEVICE inline T operator()(const T& x) const { return x; } + template + HOSTDEVICE inline U operator()(const U& x) const { + return x; + } }; template @@ -372,13 +314,13 @@ struct KronGradOpFunctor { #if defined(__NVCC__) || defined(__HIPCC__) auto stream = dev_ctx.stream(); // it is a cuda device_context if (dx) { - TensorReduce>( - dout_x, dx, {1}, static_cast(0), cub::Sum(), IdentityFunctor(), + TensorReduce( + dout_x, dx, {1}, static_cast(0), cub::Sum(), IdentityFunctor(), stream); } if (dy) { - TensorReduce>( - dout_y, dy, {1}, static_cast(0), cub::Sum(), IdentityFunctor(), + TensorReduce( + dout_y, dy, {1}, static_cast(0), cub::Sum(), IdentityFunctor(), stream); } #else diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index e8f83f6b62221b9db14734917a1a2e44d8295f6e..ddd0554add5105b0e682c6cb2e42ac4ec936c448 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -91,3 +91,9 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( l1_norm_grad, ops::L1NormGradKernel); + +REGISTER_OP_CUDA_KERNEL( + l1_norm, ops::L1NormKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/fluid/operators/l1_norm_op.h b/paddle/fluid/operators/l1_norm_op.h index c2a302ed05f1c63864629665110e29c60cedb796..918526914d95d8a91d121b7c17629c10ab4dee16 100644 --- a/paddle/fluid/operators/l1_norm_op.h +++ b/paddle/fluid/operators/l1_norm_op.h @@ -15,6 +15,7 @@ limitations under the License. 
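Editor's note on the kron gradient refactor above: the single templated KronGradElemFunctor<platform::complex<T>> keeps the same per-element rule the old complex64/complex128 specializations used, namely that for a complex product out = a * b the gradients are grad_a = grad_out * conj(b) and grad_b = grad_out * conj(a) (the complex<T>(real, -imag) construction is exactly the conjugate). The following standalone sketch illustrates that rule with std::complex; KronGradElem, its arguments, and the sample values are illustrative only and are not Paddle APIs.

// Standalone sketch of the per-element rule used by the complex
// KronGradElemFunctor: grad_a = grad_out * conj(b), grad_b = grad_out * conj(a).
// Plain std::complex is used here instead of paddle::platform::complex.
#include <complex>
#include <iostream>

template <typename T>
void KronGradElem(const std::complex<T>& grad_out, const std::complex<T>& a,
                  const std::complex<T>& b, std::complex<T>* grad_a,
                  std::complex<T>* grad_b) {
  *grad_a = grad_out * std::conj(b);  // mirrors complex<T>(B.real, -B.imag)
  *grad_b = grad_out * std::conj(a);  // mirrors complex<T>(A.real, -A.imag)
}

int main() {
  std::complex<float> a(1.f, 2.f), b(3.f, -1.f), dout(0.5f, 0.5f), da, db;
  KronGradElem(dout, a, b, &da, &db);
  std::cout << da << " " << db << "\n";  // prints (1,2) (1.5,-0.5)
}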
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -33,7 +34,7 @@ class L1NormKernel : public framework::OpKernel { auto &place = *context.template device_context().eigen_device(); - out.device(place) = x.abs().sum(); + EigenL1Norm, T>::Eval(place, out, x); } }; @@ -59,8 +60,9 @@ class L1NormGradKernel : public framework::OpKernel { auto &place = *context.template device_context().eigen_device(); - Eigen::DSizes x_dsize(x->numel()); - dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign(); + Eigen::DSizes x_dsize(x->numel()); + EigenL1NormGrad, T>::Eval( + place, dx_eigen, d_out_eigen, x_eigen, x_dsize); } }; diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index 3656de3525d32cac814e4199089de56b40ea09d8..6cd6a524e281dbc3b97a714b5bc2099aa9905c76 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -42,15 +42,45 @@ using CudnnDataType = platform::CudnnDataType; template using LayerNormParamType = typename CudnnDataType::BatchNormParamType; -inline static int GetDesiredBlockDim(int block_dim) { +inline static int GetDesiredBlockDim(int64_t block_dim) { #ifdef __HIPCC__ const int kMaxBlockDim = 256; + const int lwarpSize = 64; #else const int kMaxBlockDim = 512; + const int lwarpSize = 32; #endif - return block_dim >= kMaxBlockDim - ? kMaxBlockDim - : (1 << (static_cast(std::log2f(block_dim)))); + return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; +} + +template +static __forceinline__ __device__ U WarpReduceSum(U val) { + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + for (int offset = warpSize / 2; offset > 0; offset /= 2) { + val += paddle::platform::CudaShuffleDownSync(mask, val, offset); + } + return val; +} + +template +__forceinline__ __device__ U BlockReduceSum(U val, U *shared) { + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + + val = WarpReduceSum(val); // Each warp performs partial reduction + + __syncthreads(); + if (lane == 0) shared[wid] = val; // Write reduced value to shared memory + + __syncthreads(); // Wait for all partial reductions + // read from shared memory only if that warp existed + val = + (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : static_cast(0); + + if (wid == 0) val = WarpReduceSum(val); // Final reduce within first warp + + return val; } #define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ @@ -70,15 +100,17 @@ inline static int GetDesiredBlockDim(int block_dim) { FIXED_BLOCK_DIM_CASE_BASE(2, ##__VA_ARGS__); \ FIXED_BLOCK_DIM_CASE_BASE(1, ##__VA_ARGS__) -#define FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE( \ - log2_block_dim, feature_size, kMaxBlockNum, ...) \ - case (1 << (log2_block_dim)): { \ - for (int i = 0; i < std::ceil(feature_size / (1.0 * kMaxBlockNum)); i++) { \ - int col_offset = i * kMaxBlockNum; \ - int block_num = std::min(feature_size - col_offset, kMaxBlockNum); \ - constexpr auto kBlockDim = (1 << (log2_block_dim)); \ - __VA_ARGS__; \ - } \ +#define FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE_BASE( \ + log2_block_dim, feature_size, kMaxBlockNum, ...) 
\ + case (1 << (log2_block_dim)): { \ + for (int64_t i = 0; i < std::ceil(feature_size / (1.0 * kMaxBlockNum)); \ + i++) { \ + int64_t col_offset = i * static_cast(kMaxBlockNum); \ + int block_num = static_cast(std::min( \ + feature_size - col_offset, static_cast(kMaxBlockNum))); \ + constexpr auto kBlockDim = (1 << (log2_block_dim)); \ + __VA_ARGS__; \ + } \ } break #define FIXED_BLOCK_DIM_FIXED_BLOCK_NUM_CASE(feature_size, kMaxBlockNum, ...) \ @@ -147,31 +179,35 @@ __inline__ __device__ half rsqrt_(const half val) { template __global__ void LayerNormForward(const T *x, const U *scale, const U *bias, T *y, U *mean, U *var, float epsilon, - int feature_size) { - using BlockReduce = cub::BlockReduce, BlockDim>; - __shared__ typename BlockReduce::TempStorage temp_storage; + int64_t feature_size) { __shared__ U mean_share; __shared__ U var_share; + __shared__ U shared_mean[32]; // threadIdx.x / warpSize <= kMaxBlockDim / + // warpSize <= 1024/32 = 32; + __shared__ U shared_var[32]; - int beg_idx = blockIdx.x * feature_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * feature_size; + int64_t beg_idx = blockIdx.x * feature_size + threadIdx.x; + int64_t end_idx = (blockIdx.x + 1) * feature_size; // Step 1: Reduce to calculate mean and var U mean_val = 0; U var_val = 0; - for (int i = beg_idx; i < end_idx; i += BlockDim) { + for (int64_t i = beg_idx; i < end_idx; i += BlockDim) { U tmp = static_cast(x[i]); mean_val += tmp; var_val += (tmp * tmp); } - auto pair = BlockReduce(temp_storage) - .Reduce(PairForLayerNorm(mean_val, var_val), - PairForLayerNormAddFunctor()); + + mean_val = BlockReduceSum(mean_val, shared_mean); + var_val = BlockReduceSum(var_val, shared_var); + if (threadIdx.x == 0) { - auto tmp = pair.first_ / feature_size; + auto scale = static_cast(1.) / static_cast(feature_size); + auto tmp = mean_val * scale; mean[blockIdx.x] = mean_share = static_cast(tmp); - var[blockIdx.x] = var_share = - static_cast(pair.second_ / feature_size - tmp * tmp); + var_share = static_cast(var_val * scale - mean_share * mean_share); + var_share = var_share > U(0) ? 
var_share : U(0); + var[blockIdx.x] = var_share; } __syncthreads(); @@ -181,13 +217,13 @@ __global__ void LayerNormForward(const T *x, const U *scale, const U *bias, // Step 2: Calculate y if (scale != nullptr) { if (bias != nullptr) { - for (int i = beg_idx, j = threadIdx.x; i < end_idx; + for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx; i += BlockDim, j += BlockDim) { y[i] = static_cast( scale[j] * (static_cast(x[i]) - mean_val) * invvar + bias[j]); } } else { - for (int i = beg_idx, j = threadIdx.x; i < end_idx; + for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx; i += BlockDim, j += BlockDim) { y[i] = static_cast(scale[j] * (static_cast(x[i]) - mean_val) * invvar); @@ -195,13 +231,13 @@ __global__ void LayerNormForward(const T *x, const U *scale, const U *bias, } } else { // scale == nullptr if (bias != nullptr) { - for (int i = beg_idx, j = threadIdx.x; i < end_idx; + for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx; i += BlockDim, j += BlockDim) { y[i] = static_cast((static_cast(x[i]) - mean_val) * invvar + bias[j]); } } else { - for (int i = beg_idx, j = threadIdx.x; i < end_idx; + for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx; i += BlockDim, j += BlockDim) { y[i] = static_cast((static_cast(x[i]) - mean_val) * invvar); } @@ -211,18 +247,18 @@ __global__ void LayerNormForward(const T *x, const U *scale, const U *bias, template __inline__ __device__ void cuLoadAddStridedInputs( - const int i1_block, const int thr_load_row_off, const int thr_load_col_off, - const int i2_off, const int row_stride, U *warp_buf1, U *warp_buf2, - const T *input, const T *dout, const int i1_end, const int n2, - const U *__restrict__ mean, const U *__restrict__ var, - const float epsilon) { - const int i1 = i1_block + thr_load_row_off; + const int64_t i1_block, const int thr_load_row_off, + const int thr_load_col_off, const int i2_off, const int row_stride, + U *warp_buf1, U *warp_buf2, const T *input, const T *dout, + const int64_t i1_end, const int64_t n2, const U *__restrict__ mean, + const U *__restrict__ var, const float epsilon) { + const int64_t i1 = i1_block + thr_load_row_off; if (i1 >= i1_end) return; U curr_mean = mean[i1]; U curr_invvar = rsqrt_(var[i1] + epsilon); for (int k = 0; k < VPT; ++k) { const int i2 = i2_off + k; - const int load_idx = i1 * n2 + i2; + const int64_t load_idx = i1 * n2 + i2; const int write_idx = thr_load_row_off * row_stride + thr_load_col_off + k; if (i2 < n2) { U curr_input = static_cast(input[load_idx]); @@ -236,8 +272,8 @@ __inline__ __device__ void cuLoadAddStridedInputs( template __global__ void LayerNormBackwardPartGradGammaBeta( - const T *__restrict__ dout, const T *__restrict__ input, const int n1, - const int n2, const U *__restrict__ mean, const U *__restrict__ var, + const T *__restrict__ dout, const T *__restrict__ input, const int64_t n1, + const int64_t n2, const U *__restrict__ mean, const U *__restrict__ var, float epsilon, U *part_grad_gamma, U *part_grad_beta) { // VPTX -> value per thread.x, BDIMX -> blockDim.x, BDIMY -> blockDim.y, BDIMX // -> blockDim.x @@ -263,7 +299,7 @@ __global__ void LayerNormBackwardPartGradGammaBeta( } __syncthreads(); - for (int i1_block = blockIdx.y * BDIMY * VPTX; i1_block < n1; + for (int64_t i1_block = blockIdx.y * BDIMY * VPTX; i1_block < n1; i1_block += VPTX * BDIMY * gridDim.y) { cuLoadAddStridedInputs( i1_block, thr_load_row_off, thr_load_col_off, i2_off, row_stride, @@ -296,7 +332,7 @@ __global__ void LayerNormBackwardPartGradGammaBeta( } __syncthreads(); } - int i2 = blockIdx.x * 
blockDim.x + threadIdx.x; + int64_t i2 = blockIdx.x * blockDim.x + threadIdx.x; if (threadIdx.y == 0 && i2 < n2) { int row1 = threadIdx.y; int row2 = threadIdx.y + 1; @@ -314,7 +350,7 @@ __global__ void LayerNormBackwardSumGradGammaBeta( const int n1, const int n2, U *grad_gamma, U *grad_beta) { // sum partial gradients for gamma and beta __shared__ U buf[BDIMX * BDIMY]; - int i2 = blockIdx.x * BDIMX + threadIdx.x; + int64_t i2 = blockIdx.x * BDIMX + threadIdx.x; if (i2 < n2) { // each warp does sequential reductions until reduced part_size is num_warps int num_warp_reductions = part_size / BDIMY; @@ -364,9 +400,9 @@ __global__ void LayerNormBackwardComputeGradInput( const U *__restrict__ mean, const U *__restrict__ var, const float epsilon, const U *gamma, T *grad_input) { #ifdef __HIPCC__ - for (auto i1 = hipBlockIdx_y; i1 < n1; i1 += hipGridDim_y) { + for (auto i1 = hipBlockIdx_x; i1 < n1; i1 += hipGridDim_x) { #else - for (auto i1 = blockIdx.y; i1 < n1; i1 += gridDim.y) { + for (auto i1 = blockIdx.x; i1 < n1; i1 += gridDim.x) { #endif U sum_loss1 = U(0); U sum_loss2 = U(0); @@ -485,22 +521,17 @@ __global__ void LayerNormBackwardComputeGradInput( // Make sure that d_scale != nullptr && d_bias != nullptr // Since d_scale != nullptr, scale would not be nullptr template -__global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y, - U *d_scale, U *d_bias, T *d_x, - const U *mean, const U *var, - const U *scale, float epsilon, - int batch_size, int feature_size, - int col_offset) { - using BlockReduce = cub::BlockReduce, BlockDim>; - __shared__ typename BlockReduce::TempStorage temp_storage; - - int beg_idx = threadIdx.x * feature_size + (blockIdx.x + col_offset); - int end_idx = batch_size * feature_size + (blockIdx.x + col_offset); - int stride = BlockDim * feature_size; +__global__ void LayerNormBackwardGradientAll( + const T *x, const T *d_y, U *d_scale, U *d_bias, T *d_x, const U *mean, + const U *var, const U *scale, float epsilon, int64_t batch_size, + int64_t feature_size, int64_t col_offset) { + int64_t beg_idx = threadIdx.x * feature_size + (blockIdx.x + col_offset); + int64_t end_idx = batch_size * feature_size + (blockIdx.x + col_offset); + int64_t stride = BlockDim * feature_size; U d_scale_partial = static_cast(0), d_bias_partial = static_cast(0); - for (int i = beg_idx; i < end_idx; i += stride) { + for (int64_t i = beg_idx; i < end_idx; i += stride) { int row_idx = i / feature_size; auto var_val = real_sqrt(static_cast(var[row_idx]) + epsilon); d_scale_partial += static_cast(d_y[i]) * @@ -512,13 +543,15 @@ __global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y, } } - auto pair = BlockReduce(temp_storage) - .Reduce(PairForLayerNorm(d_scale_partial, d_bias_partial), - PairForLayerNormAddFunctor()); + __shared__ U shared_scale[32]; // threadIdx.x / warpSize <= kMaxBlockDim / + // warpSize <= 1024/32 = 32; + __shared__ U shared_bias[32]; + d_scale_partial = BlockReduceSum(d_scale_partial, shared_scale); + d_bias_partial = BlockReduceSum(d_bias_partial, shared_bias); if (threadIdx.x == 0) { - d_scale[blockIdx.x + col_offset] = pair.first_; - d_bias[blockIdx.x + col_offset] = pair.second_; + d_scale[blockIdx.x + col_offset] = d_scale_partial; + d_bias[blockIdx.x + col_offset] = d_bias_partial; } } @@ -528,16 +561,16 @@ __global__ void LayerNormBackwardGradientAll(const T *x, const T *d_y, template __global__ void LayerNormBackwardGradientScaleOrBias( const T *x, const T *d_y, U *d_scale, U *d_bias, T *d_x, const U *mean, - const U *var, const U *scale, 
float epsilon, int batch_size, - int feature_size, int col_offset) { + const U *var, const U *scale, float epsilon, int64_t batch_size, + int64_t feature_size, int col_offset) { using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; - int beg_idx = threadIdx.x * feature_size + blockIdx.x + col_offset; - int end_idx = batch_size * feature_size + blockIdx.x + col_offset; + int64_t beg_idx = threadIdx.x * feature_size + blockIdx.x + col_offset; + int64_t end_idx = batch_size * feature_size + blockIdx.x + col_offset; int stride = BlockDim * feature_size; U d_scale_or_d_bias_partial = static_cast(0); - for (int i = beg_idx; i < end_idx; i += stride) { + for (int64_t i = beg_idx; i < end_idx; i += stride) { int row_idx = i / feature_size; auto var_val = static_cast(real_sqrt(static_cast(var[row_idx]) + epsilon)); @@ -572,22 +605,20 @@ __global__ void LayerNormBackwardGradientScaleOrBias( } template -__global__ void LayerNormBackwardPostProcessToCalculateDX(const T *x, T *d_x, - const U *mean, - const U *var, - float epsilon, - int feature_size) { +__global__ void LayerNormBackwardPostProcessToCalculateDX( + const T *x, T *d_x, const U *mean, const U *var, float epsilon, + int64_t feature_size) { using BlockReduce = cub::BlockReduce, BlockDim>; __shared__ typename BlockReduce::TempStorage temp_storage; __shared__ U d_x_reduce_tmp[2]; - int beg_idx = blockIdx.x * feature_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * feature_size; + int64_t beg_idx = blockIdx.x * feature_size + threadIdx.x; + int64_t end_idx = (blockIdx.x + 1) * feature_size; U block_mean = mean[blockIdx.x]; U block_var = var[blockIdx.x]; U d_x_mean_partial = static_cast(0), d_x_var_partial = static_cast(0); - for (int i = beg_idx; i < end_idx; i += BlockDim) { + for (int64_t i = beg_idx; i < end_idx; i += BlockDim) { d_x_mean_partial += static_cast(d_x[i]); d_x_var_partial += static_cast(d_x[i]) * (static_cast(x[i]) - block_mean); @@ -608,7 +639,7 @@ __global__ void LayerNormBackwardPostProcessToCalculateDX(const T *x, T *d_x, d_x_mean_partial = d_x_reduce_tmp[0]; d_x_var_partial = d_x_reduce_tmp[1]; - for (int i = beg_idx; i < end_idx; i += BlockDim) { + for (int64_t i = beg_idx; i < end_idx; i += BlockDim) { d_x[i] -= static_cast(d_x_mean_partial); d_x[i] -= static_cast((static_cast(x[i]) - block_mean) * d_x_var_partial); @@ -621,17 +652,17 @@ __global__ void LayerNormBackwardGradientOnlyDX(const T *x, const T *d_y, T *d_x, const U *mean, const U *var, const U *scale, float epsilon, - int feature_size) { + int64_t feature_size) { using BlockReduce = cub::BlockReduce, BlockDim>; __shared__ typename BlockReduce::TempStorage temp_storage; __shared__ U d_x_reduce_tmp[2]; - int beg_idx = blockIdx.x * feature_size + threadIdx.x; - int end_idx = (blockIdx.x + 1) * feature_size; + int64_t beg_idx = blockIdx.x * feature_size + threadIdx.x; + int64_t end_idx = (blockIdx.x + 1) * feature_size; U block_mean = mean[blockIdx.x], block_var = var[blockIdx.x]; U d_x_mean_partial = static_cast(0), d_x_var_partial = static_cast(0); - for (int i = beg_idx; i < end_idx; i += BlockDim) { + for (int64_t i = beg_idx; i < end_idx; i += BlockDim) { auto var_val = static_cast(real_sqrt(static_cast(block_var) + epsilon)); if (scale != nullptr) { @@ -661,7 +692,7 @@ __global__ void LayerNormBackwardGradientOnlyDX(const T *x, const T *d_y, d_x_mean_partial = d_x_reduce_tmp[0]; d_x_var_partial = d_x_reduce_tmp[1]; - for (int i = beg_idx; i < end_idx; i += BlockDim) { + for (int64_t i = beg_idx; i < end_idx; i 
+= BlockDim) { d_x[i] -= static_cast(d_x_mean_partial); d_x[i] -= static_cast((static_cast(x[i]) - block_mean) * d_x_var_partial); @@ -671,8 +702,8 @@ __global__ void LayerNormBackwardGradientOnlyDX(const T *x, const T *d_y, template __global__ void LayerNormBackwardWhenBatchSizeIsOne( const T *x, const T *d_y, T *d_x, U *d_scale, U *d_bias, const U *mean, - const U *var, const U *scale, float epsilon, int feature_size) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; + const U *var, const U *scale, float epsilon, int64_t feature_size) { + int64_t idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < feature_size) { auto var_val = static_cast(real_sqrt(static_cast(var[idx]) + epsilon)); @@ -697,8 +728,8 @@ __global__ void LayerNormBackwardWhenBatchSizeIsOne( template static void LayerNormBackward(const T *x, const T *d_y, const U *scale, const U *mean, const U *var, T *d_x, U *d_scale, - U *d_bias, float epsilon, int batch_size, - int feature_size, + U *d_bias, float epsilon, int64_t batch_size, + int64_t feature_size, const framework::ExecutionContext &ctx) { auto &dev_ctx = ctx.cuda_device_context(); auto stream = dev_ctx.stream(); @@ -838,9 +869,8 @@ static void LayerNormBackward(const T *x, const T *d_y, const U *scale, constexpr int BDIMX1 = 32; constexpr int BDIMY1 = 4; dim3 threads1(BDIMX1, BDIMY1, 1); - const dim3 blocks1(1, batch_size, 1); LayerNormBackwardComputeGradInput< - T, U, BDIMX1, BDIMY1><<>>( + T, U, BDIMX1, BDIMY1><<>>( d_y, x, batch_size, feature_size, mean, var, epsilon, scale, d_x); break; } @@ -858,8 +888,8 @@ void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, int begin_norm_axis, float eps) { const auto x_dims = framework::make_ddim(input_shape); auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); - int batch_size = static_cast(matrix_dim[0]); - int feature_size = static_cast(matrix_dim[1]); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); switch (GetDesiredBlockDim(feature_size)) { FIXED_BLOCK_DIM_CASE( LayerNormForward<<>>( @@ -897,8 +927,8 @@ class LayerNormKernel auto *bias_data = (bias == nullptr ? 
nullptr : bias->data()); auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); - int batch_size = static_cast(matrix_dim[0]); - int feature_size = static_cast(matrix_dim[1]); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); auto stream = ctx.cuda_device_context().stream(); @@ -951,8 +981,8 @@ class LayerNormGradKernel const auto &x_dims = x->dims(); const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); - int batch_size = static_cast(matrix_dim[0]); - int feature_size = static_cast(matrix_dim[1]); + int64_t batch_size = static_cast(matrix_dim[0]); + int64_t feature_size = static_cast(matrix_dim[1]); LayerNormBackward(x_data, d_y_data, scale_data, mean_data, var_data, d_x_data, d_scale_data, d_bias_data, epsilon, diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc index c0c228ef22af3e24f1ea6e1bc8607cda718ed40e..4aafe2856605e140aa9bd154c9183682b63eca6b 100644 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -81,7 +81,7 @@ class LayerNormNPUKernel : public framework::OpKernel { Tensor value(x->type()); value.mutable_data({1}, place); FillNpuTensorWithConstant(&value, static_cast(1.0)); - auto runner = + const auto& runner = NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); runner.Run(stream); scale = &default_scale; @@ -95,7 +95,7 @@ class LayerNormNPUKernel : public framework::OpKernel { Tensor value(x->type()); value.mutable_data({1}, place); FillNpuTensorWithConstant(&value, static_cast(0)); - auto runner = + const auto& runner = NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}}); runner.Run(stream); bias = &default_bias; @@ -110,7 +110,7 @@ class LayerNormNPUKernel : public framework::OpKernel { cast_scale.Resize(scale->dims()); cast_scale.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(x->type()); - auto runner_cast_scale = + const auto& runner_cast_scale = NpuOpRunner("Cast", {*scale}, {cast_scale}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_scale.Run(stream); @@ -125,7 +125,7 @@ class LayerNormNPUKernel : public framework::OpKernel { cast_bias.Resize(bias->dims()); cast_bias.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(x->type()); - auto runner_cast_bias = + const auto& runner_cast_bias = NpuOpRunner("Cast", {*bias}, {cast_bias}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_bias.Run(stream); @@ -163,18 +163,18 @@ class LayerNormNPUKernel : public framework::OpKernel { variance->mutable_data(ctx.GetPlace()); } - auto runner = NpuOpRunner("LayerNorm", {*x, cast_scale, cast_bias}, - {*y, *tmp_mean, *tmp_variance}, - {{"begin_norm_axis", begin_norm_axis}, - {"begin_params_axis", begin_norm_axis}, - {"epsilon", epsilon}}); + const auto& runner = NpuOpRunner("LayerNorm", {*x, cast_scale, cast_bias}, + {*y, *tmp_mean, *tmp_variance}, + {{"begin_norm_axis", begin_norm_axis}, + {"begin_params_axis", begin_norm_axis}, + {"epsilon", epsilon}}); runner.Run(stream); // cast back from FP16 to FP32 if (x->type() == framework::proto::VarType::FP16 && mean->type() == framework::proto::VarType::FP32) { auto dst_dtype = ConvertToNpuDtype(mean->type()); - auto runner_cast_mean = + const auto& runner_cast_mean = NpuOpRunner("Cast", {*tmp_mean}, {*mean}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_mean.Run(stream); @@ -183,7 +183,7 @@ class LayerNormNPUKernel : public 
framework::OpKernel { if (x->type() == framework::proto::VarType::FP16 && variance->type() == framework::proto::VarType::FP32) { auto dst_dtype = ConvertToNpuDtype(variance->type()); - auto runner_cast_variance = + const auto& runner_cast_variance = NpuOpRunner("Cast", {*tmp_variance}, {*variance}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_variance.Run(stream); @@ -250,7 +250,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { Tensor value(x->type()); value.mutable_data({1}, place); FillNpuTensorWithConstant(&value, static_cast(1.0)); - auto runner = + const auto& runner = NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); runner.Run(stream); scale = &default_scale; @@ -265,7 +265,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { cast_scale.Resize(scale->dims()); cast_scale.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(x->type()); - auto runner_cast_scale = + const auto& runner_cast_scale = NpuOpRunner("Cast", {*scale}, {cast_scale}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_scale.Run(stream); @@ -280,7 +280,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { cast_mean.Resize(mean->dims()); cast_mean.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(x->type()); - auto runner_cast_mean = + const auto& runner_cast_mean = NpuOpRunner("Cast", {*mean}, {cast_mean}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_mean.Run(stream); @@ -295,7 +295,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { cast_variance.Resize(variance->dims()); cast_variance.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(x->type()); - auto runner_cast_variance = + const auto& runner_cast_variance = NpuOpRunner("Cast", {*variance}, {cast_variance}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_variance.Run(stream); @@ -343,16 +343,16 @@ class LayerNormGradNPUKernel : public framework::OpKernel { dbias->mutable_data(ctx.GetPlace()); } - auto runner = NpuOpRunner("LayerNormGrad", - {*dy, *x, cast_variance, cast_mean, cast_scale}, - {*dx, *tmp_dscale, *tmp_dbias}, {}); + const auto& runner = NpuOpRunner( + "LayerNormGrad", {*dy, *x, cast_variance, cast_mean, cast_scale}, + {*dx, *tmp_dscale, *tmp_dbias}, {}); runner.Run(stream); // cast back from FP16 to FP32 if (x->type() == framework::proto::VarType::FP16 && dscale->type() == framework::proto::VarType::FP32) { auto dst_dtype = ConvertToNpuDtype(dscale->type()); - auto runner_cast_dscale = + const auto& runner_cast_dscale = NpuOpRunner("Cast", {*tmp_dscale}, {*dscale}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_dscale.Run(stream); @@ -361,7 +361,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { if (x->type() == framework::proto::VarType::FP16 && dbias->type() == framework::proto::VarType::FP32) { auto dst_dtype = ConvertToNpuDtype(dbias->type()); - auto runner_cast_dbias = + const auto& runner_cast_dbias = NpuOpRunner("Cast", {*tmp_dbias}, {*dbias}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast_dbias.Run(stream); diff --git a/paddle/fluid/operators/lgamma_op.cc b/paddle/fluid/operators/lgamma_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..148fb05afcfd9a4ef1fcbc587a2bd33947a41000 --- /dev/null +++ b/paddle/fluid/operators/lgamma_op.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/lgamma_op.h" + +namespace paddle { +namespace operators { + +class LgammaOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of lgamma op."); + AddOutput("Out", "(Tensor), The output tensor of lgamma op."); + AddComment(R"DOC( +Lgamma Operator. + +This operator performs elementwise lgamma for input $X$. +$$out = log\Gamma(x)$$ + +)DOC"); + } +}; + +class LgammaOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Lgamma"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Lgamma"); + + auto in_dims = ctx->GetInputDim("X"); + + ctx->SetOutputDim("Out", in_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +template +class LgammaGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("lgamma_grad"); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetInput("X", this->Input("X")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +class LgammaGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@Grad", "LgammaGrad"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "LgammaGrad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + "X@Grad", "LgammaGrad"); + + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), dout_dims); + ctx->ShareLoD(framework::GradVarName("Out"), framework::GradVarName("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(lgamma, ops::LgammaOp, ops::LgammaOpMaker, + ops::LgammaGradMaker, + ops::LgammaGradMaker); + +REGISTER_OPERATOR(lgamma_grad, ops::LgammaGradOp); + +REGISTER_OP_CPU_KERNEL( + lgamma, ops::LgammaKernel, + ops::LgammaKernel) + +REGISTER_OP_CPU_KERNEL( + lgamma_grad, + ops::LgammaGradKernel, + ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.cu b/paddle/fluid/operators/lgamma_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..befd31e3bd8b1898ad6c59dca80dac3ae6de339d --- /dev/null +++ b/paddle/fluid/operators/lgamma_op.cu @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
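Editor's note on the new lgamma operator above: the forward pass computes out = log Γ(x) elementwise, and the backward pass (LgammaGradFunctor in lgamma_op.h below) multiplies the upstream gradient by the digamma function, since d/dx log Γ(x) = ψ(x). The short sketch below is only a numerical sanity check of that rule; it uses std::lgamma and a finite-difference approximation of ψ because the standard library has no digamma, whereas Paddle's kernels use Eigen::numext::lgamma and Eigen::numext::digamma. The helper name digamma_approx is illustrative.

// Numerical check of the gradient rule dx = dout * digamma(x).
#include <cmath>
#include <cstdio>

double digamma_approx(double x, double h = 1e-6) {
  // central difference of log(Gamma(x)) approximates psi(x)
  return (std::lgamma(x + h) - std::lgamma(x - h)) / (2.0 * h);
}

int main() {
  double x = 3.5, dout = 2.0;
  double dx = dout * digamma_approx(x);  // backward rule used by the grad kernel
  std::printf("lgamma(%.1f) = %.6f, dx = %.6f\n", x, std::lgamma(x), dx);
}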
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" +#include "paddle/fluid/operators/lgamma_op.h" +#include "paddle/fluid/operators/math/complex_functors.h" + +namespace paddle { +namespace operators { + +template +struct CudaLgammaFunctor; + +template +struct CudaLgammaFunctor>> { + __device__ __forceinline__ T operator()(const T* args) const { + return Eigen::numext::lgamma(args[0]); + } +}; + +template +class LgammaKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + out->mutable_data>(context.GetPlace()); + + auto& dev_ctx = context.device_context(); + std::vector ins = {x}; + std::vector outs = {out}; + auto functor = CudaLgammaFunctor(); + LaunchSameDimsElementwiseCudaKernel>(dev_ctx, ins, &outs, + functor); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + lgamma, ops::LgammaKernel, + ops::LgammaKernel); + +REGISTER_OP_CUDA_KERNEL( + lgamma_grad, + ops::LgammaGradKernel, + ops::LgammaGradKernel); diff --git a/paddle/fluid/operators/lgamma_op.h b/paddle/fluid/operators/lgamma_op.h new file mode 100644 index 0000000000000000000000000000000000000000..674054e74573208ea9bbd537419d202e1a30d8c0 --- /dev/null +++ b/paddle/fluid/operators/lgamma_op.h @@ -0,0 +1,100 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +template +struct LgammaFunctor { + LgammaFunctor(const T* input, T* output, int64_t numel) + : input_(input), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = Eigen::numext::lgamma(input_[idx]); + } + + private: + const T* input_; + T* output_; + int64_t numel_; +}; + +template +struct LgammaGradFunctor { + LgammaGradFunctor(const T* dout, const T* x, T* output, int64_t numel) + : dout_(dout), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + output_[idx] = dout_[idx] * Eigen::numext::digamma(x_[idx]); + } + + private: + const T* dout_; + const T* x_; + T* output_; + int64_t numel_; +}; + +using Tensor = framework::Tensor; + +template +class LgammaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + + auto numel = x->numel(); + auto* x_data = x->data(); + auto* out_data = out->mutable_data(context.GetPlace(), + size_t(x->numel() * sizeof(T))); + + auto& dev_ctx = context.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + LgammaFunctor functor(x_data, out_data, numel); + for_range(functor); + } +}; + +template +class LgammaGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + const framework::Tensor* d_out = + ctx.Input(framework::GradVarName("Out")); + const framework::Tensor* x = ctx.Input("X"); + framework::Tensor* d_x = + ctx.Output(framework::GradVarName("X")); + + auto numel = d_out->numel(); + auto* dout_data = d_out->data(); + auto* x_data = x->data(); + auto* dx_data = d_x->mutable_data( + ctx.GetPlace(), static_cast(numel * sizeof(T))); + + auto& dev_ctx = ctx.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + LgammaGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index 63d3f809f263588bc1fbcd9ee4305e2ce9321e38..374bfa73f21870ae630043983466601920b53f6f 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -87,6 +87,8 @@ REGISTER_OP_CPU_KERNEL( load_combine, ops::LoadCombineOpKernel, ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, ops::LoadCombineOpKernel, ops::LoadCombineOpKernel, ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 4f2c9a6ca038cff7188793f42417baf7e096ee50..ba19aee9b8d7621703cfe0ac7da24d5bde2b5339 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -69,6 +69,8 @@ REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker); REGISTER_OP_CPU_KERNEL( load, ops::LoadOpKernel, ops::LoadOpKernel, + ops::LoadOpKernel, ops::LoadOpKernel, ops::LoadOpKernel, ops::LoadOpKernel); diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index 1569512dc74f7209a4dd3921e275c02e40745535..c41805d41cef4618a3f355e04f8e156423f91b55 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -154,3 +154,8 @@ 
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( log_loss_grad, ops::LogLossGradKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/fluid/operators/log_loss_op.h b/paddle/fluid/operators/log_loss_op.h index e62de17a98603109786e49725537867c3fe7831a..e7985ab810b138da62390fae29eb4a6cf638c897 100644 --- a/paddle/fluid/operators/log_loss_op.h +++ b/paddle/fluid/operators/log_loss_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -40,9 +41,8 @@ class LogLossKernel : public framework::OpKernel { auto loss = EigenVector::Flatten(*loss_out); auto& place = *ctx.template device_context().eigen_device(); - loss.device(place) = (-(label * (prediction + epsilon).log()) - - ((static_cast(1) - label) * - (static_cast(1) - prediction + epsilon).log())); + EigenLogLoss, T>::Eval( + place, loss, prediction, label, epsilon); } }; @@ -64,9 +64,8 @@ class LogLossGradKernel : public framework::OpKernel { if (dpred) { dpred->mutable_data(ctx.GetPlace()); auto dx = framework::EigenVector::Flatten(*dpred); - dx.device(place) = dl * (-(label / (prediction + epsilon)) + - ((static_cast(1) - label) / - (static_cast(1) - prediction + epsilon))); + EigenLogLossGrad, T>::Eval( + place, dx, dl, prediction, label, epsilon); } } }; diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index e4fe92c625640dba38daa6690705eed2cf0032be..7c47ad90502ebd1f1aa0524110c501f38034b936 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -15,6 +15,7 @@ #include #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/log_softmax_op.h" +#include "paddle/fluid/operators/math/functors.h" #include "paddle/fluid/platform/cuda_device_function.h" namespace paddle { @@ -104,7 +105,7 @@ __global__ void ComputeLogSoftmaxForwardInWarp(T *dst, const T *src, #pragma unroll for (int it = 0; it < warp_iter; ++it) { int element_index = thread_in_warp_idx + it * kernel_warp_size; - if (element_index < element_count) { + if (element_index < effective_element_count) { dst[batch_id * element_count + element_index] = static_cast(elements[it] - max_value - sum); } else { @@ -142,6 +143,170 @@ void LaunchSoftmaxForwardForLastAxis(T *dst, const T *src, int dim_size, } } +// Returns the final item after reduce operation along block.x. +// Firstly, get shared memory(smem) offset, find the starting position for every +// y. +// Secondly, initialise every smem position with value 'val' of thread itself. +// Thirdly, apply standard reduction along x direction as below: +// +// -> x direction +// [o o o o o o o o] time 0 +// | |/ / +// | /| / +// | / | / +// |/ |/ +// [o o o o x x x x] time 1 +// | |/ / +// |/|/ +// [o o x x x x x x] time 2 +// |/ +// [o x x x x x x x] time 3 +// +// Finally, return the first item. +// Imaging multiple reductions executed in paralell along y axis, +// Note that when blockDim.x is not 1, it's a EVEN number in all cases, +// and the size of shared memory is even as well. +template class Functor> +__forceinline__ __device__ T BlockReduceAlongDimX(T *shared, T val) { + Functor func; + // This reduction is not Block-wise reduction, only reduce along block.x. 
+ // therefore the shared mem has offsets for different block.y. + shared += threadIdx.y * blockDim.x; + shared[threadIdx.x] = val; + int offset = blockDim.x / 2; + + while (offset > 0) { + __syncthreads(); + if (threadIdx.x < offset) { + shared[threadIdx.x] = + func(shared[threadIdx.x], shared[threadIdx.x + offset]); + } + offset /= 2; + } + __syncthreads(); + return shared[0]; +} + +template +__global__ void LogSoftmaxForwardCUDAKernelNotLastAxis( + T *output, const T *input, int outer_size, int dim_size, int inner_size) { + extern __shared__ unsigned char smem[]; + auto sdata = reinterpret_cast(smem); + + const int outer_stride = inner_size * dim_size; + const int dim_stride = inner_size; + + for (int x_id = blockIdx.x; x_id < outer_size; x_id += gridDim.x) { + for (int y_id = blockIdx.y * blockDim.y + threadIdx.y; y_id < inner_size; + y_id += blockDim.y * gridDim.y) { + const int data_offset = x_id * outer_stride + y_id; + // When blockDim.x==1, no block.x-reduction operations are needed. + // And threadIdx.x is 0 all the time, so the for-loops below run serially + // (no parallel execution). Loop over all elements along the axis and + // calculate the Max, Sum and (input[id]-Max-log(Sum)) to get the final + // log_softmax values along that axis. + // 1. reduce max + AccT max_value = -std::numeric_limits::infinity(); + // For one thread, iterate over all items it is responsible for, and get + // max_value. + // If there are N threads, N max_value will be returned. + for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { + const AccT value = + static_cast(input[data_offset + d * dim_stride]); + max_value = math::MaxFunctor()(max_value, value); + } + // If there is more than 1 thread along block x, reduce all max_values + // and get the global max_value, which is the max value along "axis". + // If there is only one thread along block x, no need to reduce, as the + // 'max_value' is the global max_value. + if (blockDim.x > 1) { + max_value = + BlockReduceAlongDimX(sdata, max_value); + } + + // 2. reduce sum + AccT sum = 0; + // Below is the same execution as '1. reduce max' + for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { + sum += std::exp(static_cast(input[data_offset + d * dim_stride]) - + max_value); + } + if (blockDim.x > 1) { + sum = BlockReduceAlongDimX(sdata, sum); + } + + // 3. input-max-log_sum and write to output + for (int d = threadIdx.x; d < dim_size; d += blockDim.x) { + output[data_offset + d * dim_stride] = static_cast( + static_cast(input[data_offset + d * dim_stride]) - max_value - + std::log(sum)); + } + } + } +} + +// block.y covers inner_size. Threads along the x axis process dim_size +// elements, while making sure not to exceed 1024 threads per block. +// Note that dim_threads, namely blockDim.x, is either 1 or an even number. +inline dim3 GetBlockSize(int dim_size, int inner_size) { + int inner_threads = inner_size; + inner_threads = std::min(inner_threads, 1024); + int dim_threads = 1; + + while (dim_threads * inner_threads <= 1024 && dim_threads <= dim_size) { + dim_threads *= 2; + } + dim_threads /= 2; + return dim3(dim_threads, inner_threads); +} + +// First cover the y axis as many blocks as possible. +// Then cover the x axis as many blocks as possible, +// and make sure not to exceed the max_active_blocks.
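Editor's note on the non-last-axis log_softmax kernel above: each (outer, inner) slice is walked three times, first a max reduction, then a sum of exp(x - max), then a final write of x - max - log(sum), with BlockReduceAlongDimX combining per-thread partials whenever blockDim.x > 1. The sketch below is a plain single-threaded CPU reference of those three passes for one slice, assuming the same data_offset/dim_stride layout as the kernel; LogSoftmaxSliceRef is an illustrative name, not Paddle code.

// CPU reference for one (outer, inner) slice of log_softmax along a middle axis.
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

void LogSoftmaxSliceRef(const std::vector<float>& in, std::vector<float>* out,
                        int data_offset, int dim_size, int dim_stride) {
  float max_value = -std::numeric_limits<float>::infinity();
  for (int d = 0; d < dim_size; ++d)  // pass 1: max along the axis
    max_value = std::max(max_value, in[data_offset + d * dim_stride]);
  float sum = 0.f;
  for (int d = 0; d < dim_size; ++d)  // pass 2: sum of shifted exponentials
    sum += std::exp(in[data_offset + d * dim_stride] - max_value);
  for (int d = 0; d < dim_size; ++d)  // pass 3: write x - max - log(sum)
    (*out)[data_offset + d * dim_stride] =
        in[data_offset + d * dim_stride] - max_value - std::log(sum);
}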
+inline dim3 GetGridSize(dim3 block, int max_active_blocks, int outer_size, + int dim_size, int inner_size) { + int inner_blocks = (inner_size + block.y - 1) / block.y; + if (inner_blocks > max_active_blocks) inner_blocks = max_active_blocks; + + int outer_blocks = (max_active_blocks + inner_blocks - 1) / inner_blocks; + if (outer_blocks > outer_size) outer_blocks = outer_size; + return dim3(outer_blocks, inner_blocks); +} + +// When designing grid size and block size, priority is given to block size, +// and grid will be determined according to the maximum number of active blocks, +// which is set by as a experience value. +template +void ComputeLaunchConfigure(Kernel k, int outer_size, int dim_size, + int inner_size, dim3 &grid, dim3 &block, + int &shared_mem, int num_sm) { + block = GetBlockSize(dim_size, inner_size); + int block_threads = block.x * block.y; + shared_mem = block.x == 1 ? 0 : block_threads * sizeof(T); + int max_active_blocks = num_sm * 2; + grid = + GetGridSize(block, max_active_blocks, outer_size, dim_size, inner_size); +} + +template +void LaunchLogSoftmaxForwardCUDAKernelNotLastAxis(T *output_data, + const T *input_data, + int outer_size, int dim_size, + int inner_size, int num_sm, + gpuStream_t stream) { + int shared_mem; + dim3 grid; + dim3 block; + + ComputeLaunchConfigure( + &LogSoftmaxForwardCUDAKernelNotLastAxis, outer_size, dim_size, + inner_size, grid, block, shared_mem, num_sm); + + LogSoftmaxForwardCUDAKernelNotLastAxis< + T, MPDType><<>>( + output_data, input_data, outer_size, dim_size, inner_size); +} + template class LogSoftmaxKernel : public framework::OpKernel { @@ -164,14 +329,15 @@ class LogSoftmaxKernel } int outer_size = SizeToAxis(axis, x->dims()); gpuStream_t stream = context.cuda_device_context().stream(); + int num_sm = context.cuda_device_context().GetSMCount(); if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { LaunchSoftmaxForwardForLastAxis(output_data, input_data, dim_size, outer_size, stream); } else { - LogSoftmaxFunctor()( - context.template device_context(), x, - out, axis); + LaunchLogSoftmaxForwardCUDAKernelNotLastAxis( + output_data, input_data, outer_size, dim_size, inner_size, num_sm, + stream); } } }; @@ -195,7 +361,7 @@ __global__ void ComputeLogSoftmaxBackwardInWarp(const T *output, constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; int batch_id = blockDim.y * blockIdx.x + threadIdx.y; - int thread_in_warp_idx = threadIdx.x % kernel_warp_size; + int thread_in_warp_idx = threadIdx.x; // 1.read data from global memory to registers AccT output_register[warp_iter]; @@ -209,8 +375,8 @@ __global__ void ComputeLogSoftmaxBackwardInWarp(const T *output, grad_output_register[iter] = static_cast( grad_output[batch_id * element_count + element_index]); } else { - output_register[iter] = AccT(0); - grad_output_register[iter] = AccT(0); + output_register[iter] = static_cast(0); + grad_output_register[iter] = static_cast(0); } } @@ -226,7 +392,7 @@ __global__ void ComputeLogSoftmaxBackwardInWarp(const T *output, #pragma unroll for (int iter = 0; iter < warp_iter; ++iter) { int element_index = thread_in_warp_idx + iter * kernel_warp_size; - if (element_index < element_count) { + if (element_index < effective_element_count) { grad_input[batch_id * element_count + element_index] = static_cast( (grad_output_register[iter] - std::exp(output_register[iter]) * sum)); } @@ -271,13 +437,13 @@ class LogSoftmaxGradKernel public: void Compute(const framework::ExecutionContext &context) const override { const 
auto *out = context.Input("Out"); - const auto *g_out = + const auto *d_out = context.Input(framework::GradVarName("Out")); - auto *g_x = context.Output(framework::GradVarName("X")); + auto *d_x = context.Output(framework::GradVarName("X")); const auto *out_data = out->data(); - const auto *g_out_data = g_out->data(); - auto *g_x_data = g_x->mutable_data(context.GetPlace()); + const auto *d_out_data = d_out->data(); + auto *d_x_data = d_x->mutable_data(context.GetPlace()); const int rank = out->dims().size(); const int axis = CanonicalAxis(context.Attr("axis"), rank); @@ -292,11 +458,11 @@ class LogSoftmaxGradKernel if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { LaunchSoftmaxBackwardForLastAxis( - g_x_data, g_out_data, out_data, dim_size, outer_size, stream); + d_x_data, d_out_data, out_data, dim_size, outer_size, stream); } else { LogSoftmaxGradFunctor()( context.template device_context(), out, - g_out, g_x, axis); + d_out, d_x, axis); } } }; diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 2e8b551ea4e43ce4dd919b6800b9b3784b4a7aac..9a0ce3900acf1c104233aeffb2746c8b4e6f8595 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -118,6 +118,11 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { ") for entry attribute.") .SetDefault("none"); + AddAttr("table_class", + "(std::string, default " + ") for table_class.") + .SetDefault("none"); + AddAttr>( "table_names", "(string vector, the split table names that will be fetched from " diff --git a/paddle/fluid/operators/lookup_table_v2_op.cc b/paddle/fluid/operators/lookup_table_v2_op.cc index feaa33e28dfc54cdfac9d55e22b3bdfcf4c587e5..f1bb9a985f4c1da262202a98b15847d85ef8e305 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cc +++ b/paddle/fluid/operators/lookup_table_v2_op.cc @@ -197,10 +197,12 @@ REGISTER_OPERATOR(lookup_table_v2_grad, ops::LookupTableV2OpGrad, ops::LookupTableV2OpGradVarTypeInference); REGISTER_OP_CPU_KERNEL(lookup_table_v2, ops::LookupTableV2Kernel, - ops::LookupTableV2Kernel); -REGISTER_OP_CPU_KERNEL(lookup_table_v2_grad, - ops::LookupTableV2GradKernel, - ops::LookupTableV2GradKernel); + ops::LookupTableV2Kernel, + ops::LookupTableV2Kernel); +REGISTER_OP_CPU_KERNEL( + lookup_table_v2_grad, ops::LookupTableV2GradKernel, + ops::LookupTableV2GradKernel, + ops::LookupTableV2GradKernel); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(lookup_table_v2) diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h index 877baebdb6a1aacabe953ea40a7849c01c608081..4e8d96afa03c4a6dc68f762d6274acee5f0c0dd0 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.h +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -91,8 +91,8 @@ class LookupTableV2Kernel : public framework::OpKernel { int64_t row_width = table_t.value().dims()[1]; const auto *table = table_t.value().data(); auto *output = output_t->mutable_data(context.GetPlace()); + auto input_data_type = table_t.value().type(); - auto blas = math::GetBlas(context); for (int64_t i = 0; i < ids_numel; ++i) { if (padding_idx != kNoPadding && ids[i] == padding_idx) { memset(output + i * row_width, 0, row_width * sizeof(T)); @@ -109,8 +109,15 @@ class LookupTableV2Kernel : public framework::OpKernel { platform::errors::InvalidArgument( "the input key should be exists. 
But received %d.", id_index)); - blas.VCOPY(row_width, table + id_index * row_width, - output + i * row_width); + + if (input_data_type == framework::proto::VarType::BF16) { + memcpy(output + i * row_width, table + id_index * row_width, + row_width * sizeof(T)); + } else { + auto blas = math::GetBlas(context); + blas.VCOPY(row_width, table + id_index * row_width, + output + i * row_width); + } } } } diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index 87618b954d232dcfe5d0ed0b8062db7c324c1290..2a8f47462345188c3870ca07119fe7687a1ebe9f 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -39,14 +39,14 @@ class LookupTableV2NPUKernel : public framework::OpKernel { table_var->IsType(), true, platform::errors::InvalidArgument("npu only accept LoDTensor")); output_t->mutable_data(ctx.GetPlace()); - framework::NPUAttributeMap attr_input = {{"validate_indices", false}}; - auto runner = - NpuOpRunner("Gather", {*table_t, *ids_t}, {*output_t}, attr_input); - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); + NpuOpRunner runner; + runner.SetType("GatherV2") + .AddInput(*table_t) + .AddInput(*ids_t) + .AddInput(std::vector{0}) + .AddOutput(*output_t); + runner.Run(); } }; @@ -65,17 +65,31 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - auto runner_zeros = - NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t}); - runner_zeros.Run(stream); - - // NOTE(zhiqiu): It seems in cann 20.1, the first input and output - // can be different tensor, but in cann 20.2+, it does inplace operation. - // Thus, the first input and output should be same tensor. - auto runner_scatter = - NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t}, - {*table_grad_t}, {{"use_locking", true}}); - runner_scatter.Run(stream); + int embedding_dim = table_grad_t->dims()[1]; + + if (embedding_dim % 32 == 0) { + // NOTE(pangyoki): The embedding_dim of Tensor used in + // EmbeddingDenseGrad must be an integer multiple of 32. + int num_weights = table_grad_t->dims()[0]; + const auto &runner = + NpuOpRunner("EmbeddingDenseGrad", {*output_grad_t, *ids_t}, + {*table_grad_t}, {{"num_weights", num_weights}, + {"padding_idx", -1}, + {"scale_grad_by_freq", false}}); + runner.Run(stream); + } else { + const auto &runner_zeros = + NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t}); + runner_zeros.Run(stream); + + // NOTE(zhiqiu): It seems in cann 20.1, the first input and output + // can be different tensor, but in cann 20.2+, it does inplace operation. + // Thus, the first input and output should be same tensor. + const auto &runner_scatter = + NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t}, + {*table_grad_t}, {{"use_locking", true}}); + runner_scatter.Run(stream); + } } }; } // namespace operators diff --git a/paddle/fluid/operators/marker_op.cc b/paddle/fluid/operators/marker_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..397e3bfc6ad262d83f46f6751dd9372fbb20efcd --- /dev/null +++ b/paddle/fluid/operators/marker_op.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/profiler.h"
+
+namespace paddle {
+namespace operators {
+
+class MarkerOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    std::string marker_role = ctx->Attrs().Get("marker_role");
+    std::string marker_pos = ctx->Attrs().Get("marker_pos");
+
+    VLOG(3) << "The role is:" << marker_role << ";"
+            << "The position is:" << marker_pos << ".";
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(framework::proto::VarType::FP32,
+                                   ctx.GetPlace());
+  }
+};
+
+class MarkerOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddAttr("marker_role",
+            "(string, default forward) forward or backward,"
+            " marks different stages of the process.")
+        .SetDefault("forward");
+    AddAttr(
+        "marker_pos",
+        "(string, default B) the position where the marker is placed, "
+        "B stands for the beginning of a duration,"
+        " E stands for the end of a duration.")
+        .SetDefault("B");
+    AddComment(
+        R"DOC(Marker Operator - Add marker at the beginning/end of a forward/backward process.)DOC");
+  }
+};
+
+template 
+class MarkerOpCPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto marker_role = ctx.Attr("marker_role");
+    auto marker_pos = ctx.Attr("marker_pos");
+
+    platform::RecordEvent record_event(
+        "MarkerCPU", platform::EventRole::kInnerOp,
+        "marker_" + marker_role + "_" + marker_pos);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_WITHOUT_GRADIENT(marker, ops::MarkerOp, ops::MarkerOpMaker);
+REGISTER_OP_CPU_KERNEL(marker, ops::MarkerOpCPUKernel);
diff --git a/paddle/fluid/operators/marker_op.cu b/paddle/fluid/operators/marker_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b918210389169ab2f85f1a8bcd244e59a480281a
--- /dev/null
+++ b/paddle/fluid/operators/marker_op.cu
@@ -0,0 +1,61 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { + +template +__global__ void SimpleMarkerKernel(T* in, T* out, int ndim) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + for (; idx < ndim; idx += blockDim.x * gridDim.x) { + out[idx] = in[idx]; + } +} + +template +class MarkerOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + auto marker_role = ctx.Attr("marker_role"); + auto marker_pos = ctx.Attr("marker_pos"); + VLOG(3) << "marker role: " << marker_role + << " marker position: " << marker_pos; + + framework::Tensor A; + framework::Tensor B; + auto* in_temp = A.mutable_data({32, 1}, ctx.GetPlace()); + auto* out_temp = B.mutable_data({32, 1}, ctx.GetPlace()); + platform::RecordEvent record_event( + "MarkerCUDA", platform::EventRole::kInnerOp, + "marker_" + marker_role + "_" + marker_pos); + SimpleMarkerKernel<<<1, 32, 0, dev_ctx.stream()>>>(in_temp, out_temp, + 32); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(marker, ops::MarkerOpCUDAKernel); diff --git a/paddle/fluid/operators/masked_select_op.cc b/paddle/fluid/operators/masked_select_op.cc index 3b44c02757fae9648a7e660a06c03af45d621e02..17bf5df18adc543ea487160a31d05d3c802b95a7 100644 --- a/paddle/fluid/operators/masked_select_op.cc +++ b/paddle/fluid/operators/masked_select_op.cc @@ -26,8 +26,9 @@ class MaskedSelectOp : public framework::OperatorWithKernel { OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "Input", "MaskedSelect"); OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "MaskedSelect"); OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Out", "MaskedSelect"); - framework::DDim output_dims(ctx->GetInputDim("X")); - ctx->SetOutputDim("Y", output_dims); + + // output will only be a 1-D Tensor + ctx->SetOutputDim("Y", framework::make_ddim({-1})); ctx->ShareLoD("X", /*->*/ "Y"); } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index fdbc0c68525baeef6d9af66917e8499fbfd1a02f..a13fffe15cf2405fadd9e1c09a962748c66e255f 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -56,7 +56,13 @@ cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) math_library(math_function DEPS blas) math_library(maxouting) math_library(pooling) -math_library(selected_rows_functor DEPS selected_rows math_function blas) + +if(WITH_MKLDNN) + math_library(selected_rows_functor DEPS selected_rows math_function blas mkldnn_axpy_handler) +else() + math_library(selected_rows_functor DEPS selected_rows math_function blas) +endif() + math_library(sequence2batch) math_library(sequence_padding) math_library(sequence_pooling DEPS math_function jit_kernel_helper) diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 512f9c62415e5d1b09a1b649e78c72ac2d9f2d88..4d7218cd89e04b5122ff4385abfb2c7305e40c0a 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and 
limitations under the License. */ #include + #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" @@ -311,6 +312,156 @@ __global__ void SoftmaxKernelWithEltadd2( #endif } +template +__global__ void SoftmaxKernelWithEltaddForLarge(T *qk_buf, const T *bias_qk, + const int batch_size, + const int head_num, + const int seq_len, + const unsigned mask) { + int qk_offset = blockIdx.x * seq_len; + assert(blockDim.x % 32 == 0); + + T stride_max = -1e20f; + for (int i = 0; i < seq_len; i += blockDim.x) { + stride_max = qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset] > + stride_max + ? qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset] + : stride_max; + } + T max_val = blockReduceMax(stride_max, mask); + + T stride_sum = 0.f; + for (int i = 0; i < seq_len; i += blockDim.x) { + stride_sum += __expf(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset] - max_val); + } + T sum_val = blockReduceSum(stride_sum, mask); + + for (int i = 0; i < seq_len; i += blockDim.x) { + qk_buf[threadIdx.x + i + qk_offset] = + (T)(__expf(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset] - max_val) / + sum_val); + } +} + +// HIP defined __HIP_NO_HALF_CONVERSIONS__ +#ifndef __HIPCC__ // @{ Half kernel: SoftmaxKernelWithEltadd +template <> +__global__ void SoftmaxKernelWithEltaddForLarge( + half *qk_buf, const half *bias_qk, const int batch_size, const int head_num, + const int seq_len, const unsigned mask) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + int qk_offset = blockIdx.x * seq_len; + assert(blockDim.x % 32 == 0); + + float stride_max = -1e20f; + for (int i = 0; i < seq_len; i += blockDim.x) { + float tmp = static_cast(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset]); + stride_max = tmp > stride_max ? 
tmp : stride_max; + } + float max_val = blockReduceMax(stride_max, mask); + + float stride_sum = 0.f; + for (int i = 0; i < seq_len; i += blockDim.x) { + float tmp = static_cast(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset]); + stride_sum += __expf(tmp - max_val); + } + float sum_val = blockReduceSum(stride_sum, mask); + + for (int i = 0; i < seq_len; i += blockDim.x) { + float tmp = + __expf(static_cast(qk_buf[threadIdx.x + i + qk_offset] + + bias_qk[threadIdx.x + i + qk_offset]) - + max_val); + qk_buf[threadIdx.x + i + qk_offset] = (half)(tmp / sum_val); + } +#endif +} +#endif // @} End Half kernel: SoftmaxKernelWithEltadd + +template +__global__ void SoftmaxKernelWithEltaddForLarge2(T *qk_buf_, const T *bias_qk_, + const int batch_size, + const int head_num, + const int seq_len, + const unsigned mask) { + int qk_offset = blockIdx.x * seq_len; + assert(blockDim.x % 32 == 0); + + float2 stride_max = make_float2(-1e20f, -1e20f); + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + stride_max.x = max(stride_max.x, cur.x); + stride_max.y = max(stride_max.y, cur.y); + } + float max_val = blockReduceMax(max(stride_max.x, stride_max.y), mask); + + float2 stride_sum = make_float2(0.f, 0.f); + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + stride_sum.x += __expf(cur.x - max_val); + stride_sum.y += __expf(cur.y - max_val); + } + + float sum_val = + blockReduceSum(stride_sum.x + stride_sum.y, mask) + 1e-6f; + + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair( + __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val); + } +} + +template <> +__global__ void SoftmaxKernelWithEltaddForLarge2( + half2 *qk_buf_, const half2 *bias_qk_, const int batch_size, + const int head_num, const int seq_len, const unsigned mask) { +// operator "+" of half only suppotted after cuda version 10.0 +// HIP defined __HIP_NO_HALF_CONVERSIONS__ in hip.cmake +#if defined(PADDLE_WITH_CUDA) && \ + (CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) && CUDA_VERSION >= 10000) + + int qk_offset = blockIdx.x * seq_len; + assert(blockDim.x % 32 == 0); + + float2 stride_max = make_float2(-1e20f, -1e20f); + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + stride_max.x = max(stride_max.x, cur.x); + stride_max.y = max(stride_max.y, cur.y); + } + float max_val = blockReduceMax(max(stride_max.x, stride_max.y), mask); + + float2 stride_sum = make_float2(0.f, 0.f); + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + stride_sum.x += __expf(cur.x - max_val); + stride_sum.y += __expf(cur.y - max_val); + } + + float sum_val = + blockReduceSum(stride_sum.x + stride_sum.y, mask) + 1e-6f; + + for (int i = 0; i < seq_len; i += blockDim.x) { + float2 cur = ToFloat2(qk_buf_[threadIdx.x + i + qk_offset] + + bias_qk_[threadIdx.x + i + qk_offset]); + qk_buf_[threadIdx.x + i + qk_offset] = FloatsToPair( + __expf(cur.x - max_val) / sum_val, __expf(cur.y - max_val) / sum_val); + } +#endif +} + template inline void MatMulWithHeadQK(const 
platform::CUDADeviceContext &context, int head_num, int seq_len, int size_per_head, @@ -332,31 +483,48 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context, reinterpret_cast(qk_buf_), batch_size * head_num, seq_len * size_per_head, seq_len * size_per_head); - int grid = batch_size * head_num * seq_len; - int block = seq_len; - - // Align block to 32, also limit seq_len to max block size. - PADDLE_ENFORCE_LE(seq_len, 1024, platform::errors::InvalidArgument( - "seq_len should <= 1024, " - "but received seq_len is:%d", - seq_len)); - if (seq_len % 2 == 0) { - block = (seq_len <= 64) ? 32 : ((seq_len + 63) / 64) * 32; - if (std::is_same::value) { - SoftmaxKernelWithEltadd2<<>>( - reinterpret_cast(qk_buf_), - reinterpret_cast(bias_qk), batch_size, head_num, - seq_len / 2, FINAL_MASK); + if (seq_len <= 1024) { + int grid = batch_size * head_num * seq_len; + int block = seq_len; + + // Align block to 32, also limit seq_len to max block size. + if (seq_len % 2 == 0) { + block = (seq_len <= 64) ? 32 : ((seq_len + 63) / 64) * 32; + if (std::is_same::value) { + SoftmaxKernelWithEltadd2<<>>( + reinterpret_cast(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } else { + SoftmaxKernelWithEltadd2<__half2><<>>( + reinterpret_cast<__half2 *>(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } } else { - SoftmaxKernelWithEltadd2<__half2><<>>( - reinterpret_cast<__half2 *>(qk_buf_), - reinterpret_cast(bias_qk), batch_size, head_num, - seq_len / 2, FINAL_MASK); + block = (seq_len <= 32) ? 32 : ((seq_len + 31) / 32) * 32; + SoftmaxKernelWithEltadd<<>>( + qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK); } } else { - block = (seq_len <= 32) ? 32 : ((seq_len + 31) / 32) * 32; - SoftmaxKernelWithEltadd<<>>( - qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK); + int grid = batch_size * head_num * seq_len; + int block = 512; + if (seq_len % 2 == 0) { + if (std::is_same::value) { + SoftmaxKernelWithEltaddForLarge2<<>>( + reinterpret_cast(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } else { + SoftmaxKernelWithEltaddForLarge2<__half2><<>>( + reinterpret_cast<__half2 *>(qk_buf_), + reinterpret_cast(bias_qk), batch_size, head_num, + seq_len / 2, FINAL_MASK); + } + } else { + SoftmaxKernelWithEltaddForLarge<<>>( + qk_buf_, bias_qk, batch_size, head_num, seq_len, FINAL_MASK); + } } } diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index c44c15adb13caf9be401c3174e68e229d1eea745..477f3e0f6a2dc5cfd6fcc0b0624f8f0c2563fe8b 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -260,13 +260,13 @@ struct CUBlas { }; template <> -struct CUBlas { - using complex64 = platform::complex64; - +struct CUBlas> { static void GEMV(cublasHandle_t handle, cublasOperation_t transa, int m, - int n, const complex64 *alpha, const complex64 *A, int lda, - const complex64 *B, int ldb, const complex64 *beta, - complex64 *C, int ldc) { + int n, const platform::complex *alpha, + const platform::complex *A, int lda, + const platform::complex *B, int ldb, + const platform::complex *beta, + platform::complex *C, int ldc) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -275,9 +275,10 @@ struct CUBlas { reinterpret_cast(C), ldc)); } - static void AXPY(cublasHandle_t handle, int n, const 
complex64 *alpha, - const complex64 *X, const int incX, complex64 *Y, - const int incY) { + static void AXPY(cublasHandle_t handle, int n, + const platform::complex *alpha, + const platform::complex *X, const int incX, + platform::complex *Y, const int incY) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, @@ -287,11 +288,13 @@ struct CUBlas { static void GEMM_STRIDED_BATCH(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const complex64 *alpha, const complex64 *A, - int lda, long long int strideA, // NOLINT - const complex64 *B, // NOLINT - int ldb, long long int strideB, // NOLINT - const complex64 *beta, complex64 *C, int ldc, + const platform::complex *alpha, + const platform::complex *A, int lda, + long long int strideA, // NOLINT + const platform::complex *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const platform::complex *beta, + platform::complex *C, int ldc, long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 @@ -310,9 +313,11 @@ struct CUBlas { static void GEMM(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const complex64 *alpha, const complex64 *A, int lda, - const complex64 *B, int ldb, const complex64 *beta, - complex64 *C, int ldc) { + const platform::complex *alpha, + const platform::complex *A, int lda, + const platform::complex *B, int ldb, + const platform::complex *beta, + platform::complex *C, int ldc) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), @@ -356,13 +361,13 @@ struct CUBlas { }; template <> -struct CUBlas { - using complex128 = platform::complex128; - +struct CUBlas> { static void GEMV(cublasHandle_t handle, cublasOperation_t transa, int m, - int n, const complex128 *alpha, const complex128 *A, int lda, - const complex128 *B, int ldb, const complex128 *beta, - complex128 *C, int ldc) { + int n, const platform::complex *alpha, + const platform::complex *A, int lda, + const platform::complex *B, int ldb, + const platform::complex *beta, + platform::complex *C, int ldc) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemv( handle, transa, m, n, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -371,9 +376,10 @@ struct CUBlas { reinterpret_cast(C), ldc)); } - static void AXPY(cublasHandle_t handle, int n, const complex128 *alpha, - const complex128 *X, const int incX, complex128 *Y, - const int incY) { + static void AXPY(cublasHandle_t handle, int n, + const platform::complex *alpha, + const platform::complex *X, const int incX, + platform::complex *Y, const int incY) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, @@ -383,11 +389,13 @@ struct CUBlas { static void GEMM_STRIDED_BATCH(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const complex128 *alpha, const complex128 *A, - int lda, long long int strideA, // NOLINT - const complex128 *B, // NOLINT - int ldb, long long int strideB, // NOLINT - const complex128 *beta, complex128 *C, int ldc, + const platform::complex *alpha, + const platform::complex *A, int lda, + long long int strideA, // NOLINT + const platform::complex *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const platform::complex *beta, + platform::complex *C, int ldc, long long int strideC, // NOLINT int batchCount) 
{ #if CUDA_VERSION >= 8000 @@ -406,9 +414,11 @@ struct CUBlas { static void GEMM(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, - const complex128 *alpha, const complex128 *A, int lda, - const complex128 *B, int ldb, const complex128 *beta, - complex128 *C, int ldc) { + const platform::complex *alpha, + const platform::complex *A, int lda, + const platform::complex *B, int ldb, + const platform::complex *beta, + platform::complex *C, int ldc) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), @@ -535,9 +545,9 @@ template <> template <> inline void Blas::GEMM( CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, - platform::complex64 alpha, const platform::complex64 *A, - const platform::complex64 *B, platform::complex64 beta, - platform::complex64 *C) const { + platform::complex alpha, const platform::complex *A, + const platform::complex *B, platform::complex beta, + platform::complex *C) const { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -565,16 +575,16 @@ inline void Blas::GEMM( // input/output in fp16, computation in fp32, which can also be accelerated // using tensor cores in volta GPUs. auto &cuda_ctx = const_cast(context_); - CUBlas::GEMM_EX( + CUBlas>::GEMM_EX( &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_32F, ldb, A, CUDA_C_32F, lda, &c_beta, C, CUDA_C_32F, N, CUDA_C_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, - &c_alpha, h_B, ldb, h_A, lda, &c_beta, - h_C, N); + CUBlas>::GEMM(handle, cuTransB, cuTransA, N, M, K, + &c_alpha, h_B, ldb, h_A, lda, + &c_beta, h_C, N); }); #endif // CUDA_VERSION >= 8000 } @@ -583,9 +593,9 @@ template <> template <> inline void Blas::GEMM( CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, - platform::complex128 alpha, const platform::complex128 *A, - const platform::complex128 *B, platform::complex128 beta, - platform::complex128 *C) const { + platform::complex alpha, const platform::complex *A, + const platform::complex *B, platform::complex beta, + platform::complex *C) const { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -614,16 +624,16 @@ inline void Blas::GEMM( // input/output in fp16, computation in fp32, which can also be accelerated // using tensor cores in volta GPUs. 
auto &cuda_ctx = const_cast(context_); - CUBlas::GEMM_EX( + CUBlas>::GEMM_EX( &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, CUDA_C_64F, ldb, A, CUDA_C_64F, lda, &c_beta, C, CUDA_C_64F, N, CUDA_C_64F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, - &c_alpha, h_B, ldb, h_A, lda, &c_beta, - h_C, N); + CUBlas>::GEMM(handle, cuTransB, cuTransA, N, M, K, + &c_alpha, h_B, ldb, h_A, lda, + &c_beta, h_C, N); }); #endif // CUDA_VERSION >= 8000 } diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 64b533de098cad2b91e296fdee3da03f0e014509..eab513e24bc8090d30a42cd1149c6bf65d690839 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -15,6 +15,7 @@ #ifdef PADDLE_WITH_MKLML #include #endif + #include #include #include @@ -22,12 +23,24 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { namespace math { +namespace detail { + +template +static void axpy(int n, const T alpha, const T *x, const int incx, T *y, + const int incy) { + // Y = Y + alpha * X + while (n-- > 0) { + *y += alpha * *x; + y = y + incy; + x = x + incx; + } +} +} // namespace detail template struct CBlas; @@ -43,6 +56,11 @@ struct CBlas { template <> struct CBlas { + template + static void AXPY(ARGS... args) { + detail::axpy(args...); + } + template static void VCOPY(ARGS... args) { PADDLE_THROW(platform::errors::Unimplemented( @@ -305,11 +323,11 @@ struct CBlas { }; template <> -struct CBlas { +struct CBlas> { template - static void AXPY(int n, const paddle::platform::complex64 alpha, - const paddle::platform::complex64 *X, const int incX, - paddle::platform::complex64 *Y, const int incY) { + static void AXPY(int n, const paddle::platform::complex alpha, + const paddle::platform::complex *X, const int incX, + paddle::platform::complex *Y, const int incY) { platform::dynload::cblas_caxpy(n, &alpha, X, incX, Y, incY); } @@ -344,35 +362,35 @@ struct CBlas { */ template - static void VADD(int n, const paddle::platform::complex64 *a, - const paddle::platform::complex64 *b, - paddle::platform::complex64 *y) { + static void VADD(int n, const paddle::platform::complex *a, + const paddle::platform::complex *b, + paddle::platform::complex *y) { for (int i = 0; i < n; ++i) { y[i] = a[i] + b[i]; } } template - static void VSUB(int n, const paddle::platform::complex64 *a, - const paddle::platform::complex64 *b, - paddle::platform::complex64 *y) { + static void VSUB(int n, const paddle::platform::complex *a, + const paddle::platform::complex *b, + paddle::platform::complex *y) { for (int i = 0; i < n; ++i) { y[i] = a[i] - b[i]; } } template - static void VMUL(int n, const paddle::platform::complex64 *a, - const paddle::platform::complex64 *b, - paddle::platform::complex64 *y) { + static void VMUL(int n, const paddle::platform::complex *a, + const paddle::platform::complex *b, + paddle::platform::complex *y) { for (int i = 0; i < n; ++i) { y[i] = a[i] * b[i]; } } template - static void VDIV(int n, const paddle::platform::complex64 *a, - const paddle::platform::complex64 *b, - paddle::platform::complex64 *y) { + static void VDIV(int n, const paddle::platform::complex *a, + const 
paddle::platform::complex *b, + paddle::platform::complex *y) { for (int i = 0; i < n; ++i) { y[i] = a[i] / b[i]; } @@ -380,11 +398,11 @@ struct CBlas { template static void GEMV(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, int M, int N, - paddle::platform::complex64 alpha, - const paddle::platform::complex64 *A, int lda, - const paddle::platform::complex64 *X, int incx, - paddle::platform::complex64 beta, - paddle::platform::complex64 *Y, int incy) { + paddle::platform::complex alpha, + const paddle::platform::complex *A, int lda, + const paddle::platform::complex *X, int incx, + paddle::platform::complex beta, + paddle::platform::complex *Y, int incy) { const void *a_ = (const void *)(A); const void *x_ = (const void *)(X); void *y_ = static_cast(Y); @@ -395,11 +413,11 @@ struct CBlas { template static void GEMM(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans_a, CBLAS_TRANSPOSE trans_b, int M, int N, int K, - paddle::platform::complex64 alpha, - const paddle::platform::complex64 *A, int lda, - const paddle::platform::complex64 *B, int ldb, - paddle::platform::complex64 beta, - paddle::platform::complex64 *C, int ldc) { + paddle::platform::complex alpha, + const paddle::platform::complex *A, int lda, + const paddle::platform::complex *B, int ldb, + paddle::platform::complex beta, + paddle::platform::complex *C, int ldc) { const void *a_ = (const void *)(A); const void *b_ = (const void *)(B); void *c_ = static_cast(C); @@ -410,11 +428,12 @@ struct CBlas { template static void GEMM_BATCH(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE *trans_a, CBLAS_TRANSPOSE *trans_b, int *M, int *N, int *K, - paddle::platform::complex64 *alpha, - const paddle::platform::complex64 **A, const int *lda, - const paddle::platform::complex64 **B, const int *ldb, - paddle::platform::complex64 *beta, - paddle::platform::complex64 **C, const int *ldc, + paddle::platform::complex *alpha, + const paddle::platform::complex **A, + const int *lda, + const paddle::platform::complex **B, + const int *ldb, paddle::platform::complex *beta, + paddle::platform::complex **C, const int *ldc, int group_count, int *group_size) { const void **A_void = (const void **)(&(*A)); const void **B_void = (const void **)(&(*B)); @@ -432,11 +451,11 @@ struct CBlas { }; template <> -struct CBlas { +struct CBlas> { template - static void AXPY(int n, const paddle::platform::complex128 alpha, - const paddle::platform::complex128 *X, const int incX, - paddle::platform::complex128 *Y, const int incY) { + static void AXPY(int n, const paddle::platform::complex alpha, + const paddle::platform::complex *X, const int incX, + paddle::platform::complex *Y, const int incY) { platform::dynload::cblas_zaxpy(n, &alpha, X, incX, Y, incY); } @@ -471,35 +490,35 @@ struct CBlas { */ template - static void VADD(int n, const paddle::platform::complex128 *a, - const paddle::platform::complex128 *b, - paddle::platform::complex128 *y) { + static void VADD(int n, const paddle::platform::complex *a, + const paddle::platform::complex *b, + paddle::platform::complex *y) { for (int i = 0; i < n; ++i) { y[i] = a[i] + b[i]; } } template - static void VSUB(int n, const paddle::platform::complex128 *a, - const paddle::platform::complex128 *b, - paddle::platform::complex128 *y) { + static void VSUB(int n, const paddle::platform::complex *a, + const paddle::platform::complex *b, + paddle::platform::complex *y) { for (int i = 0; i < n; ++i) { y[i] = a[i] - b[i]; } } template - static void VMUL(int n, const paddle::platform::complex128 *a, - const paddle::platform::complex128 *b, - 
paddle::platform::complex128 *y) { + static void VMUL(int n, const paddle::platform::complex *a, + const paddle::platform::complex *b, + paddle::platform::complex *y) { for (int i = 0; i < n; ++i) { y[i] = a[i] * b[i]; } } template - static void VDIV(int n, const paddle::platform::complex128 *a, - const paddle::platform::complex128 *b, - paddle::platform::complex128 *y) { + static void VDIV(int n, const paddle::platform::complex *a, + const paddle::platform::complex *b, + paddle::platform::complex *y) { for (int i = 0; i < n; ++i) { y[i] = a[i] / b[i]; } @@ -507,11 +526,11 @@ struct CBlas { template static void GEMV(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans, int M, int N, - paddle::platform::complex128 alpha, - const paddle::platform::complex128 *A, int lda, - const paddle::platform::complex128 *X, int incx, - paddle::platform::complex128 beta, - paddle::platform::complex128 *Y, int incy) { + paddle::platform::complex alpha, + const paddle::platform::complex *A, int lda, + const paddle::platform::complex *X, int incx, + paddle::platform::complex beta, + paddle::platform::complex *Y, int incy) { const void *a_ = (const void *)(A); const void *x_ = (const void *)(X); void *y_ = static_cast(Y); @@ -522,11 +541,11 @@ struct CBlas { template static void GEMM(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE trans_a, CBLAS_TRANSPOSE trans_b, int M, int N, int K, - paddle::platform::complex128 alpha, - const paddle::platform::complex128 *A, int lda, - const paddle::platform::complex128 *B, int ldb, - paddle::platform::complex128 beta, - paddle::platform::complex128 *C, int ldc) { + paddle::platform::complex alpha, + const paddle::platform::complex *A, int lda, + const paddle::platform::complex *B, int ldb, + paddle::platform::complex beta, + paddle::platform::complex *C, int ldc) { const void *a_ = (const void *)(A); const void *b_ = (const void *)(B); void *c_ = static_cast(C); @@ -537,11 +556,13 @@ struct CBlas { template static void GEMM_BATCH(CBLAS_LAYOUT layout, CBLAS_TRANSPOSE *trans_a, CBLAS_TRANSPOSE *trans_b, int *M, int *N, int *K, - paddle::platform::complex128 *alpha, - const paddle::platform::complex128 **A, const int *lda, - const paddle::platform::complex128 **B, const int *ldb, - paddle::platform::complex128 *beta, - paddle::platform::complex128 **C, const int *ldc, + paddle::platform::complex *alpha, + const paddle::platform::complex **A, + const int *lda, + const paddle::platform::complex **B, + const int *ldb, + paddle::platform::complex *beta, + paddle::platform::complex **C, const int *ldc, int group_count, int *group_size) { const void **A_void = (const void **)(&(*A)); const void **B_void = (const void **)(&(*B)); @@ -617,76 +638,76 @@ struct CBlas { }; template <> -struct CBlas { +struct CBlas> { template static void VCOPY(ARGS... 
args) { cblas_ccopy(args...); } template - static void AXPY(int n, const paddle::platform::complex64 alpha, - const paddle::platform::complex64 *X, const int incX, - paddle::platform::complex64 *Y, const int incY) { + static void AXPY(int n, const paddle::platform::complex alpha, + const paddle::platform::complex *X, const int incX, + paddle::platform::complex *Y, const int incY) { cblas_caxpy(n, &alpha, X, incX, Y, incY); } template static void GEMV(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA, const int M, const int N, - const paddle::platform::complex64 alpha, - const paddle::platform::complex64 *A, const int lda, - const paddle::platform::complex64 *X, const int incX, - const paddle::platform::complex64 beta, - paddle::platform::complex64 *Y, const int incY) { + const paddle::platform::complex alpha, + const paddle::platform::complex *A, const int lda, + const paddle::platform::complex *X, const int incX, + const paddle::platform::complex beta, + paddle::platform::complex *Y, const int incY) { cblas_cgemv(layout, TransA, M, N, &alpha, A, lda, X, incX, &beta, Y, incY); } template static void GEMM(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, - const int K, const paddle::platform::complex64 alpha, - const paddle::platform::complex64 *A, const int lda, - const paddle::platform::complex64 *B, const int ldb, - const paddle::platform::complex64 beta, - paddle::platform::complex64 *C, const int ldc) { + const int K, const paddle::platform::complex alpha, + const paddle::platform::complex *A, const int lda, + const paddle::platform::complex *B, const int ldb, + const paddle::platform::complex beta, + paddle::platform::complex *C, const int ldc) { cblas_cgemm(layout, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc); } }; template <> -struct CBlas { +struct CBlas> { template static void VCOPY(ARGS... 
args) { cblas_zcopy(args...); } template - static void AXPY(int n, const paddle::platform::complex128 alpha, - const paddle::platform::complex128 *X, const int incX, - paddle::platform::complex128 *Y, const int incY) { + static void AXPY(int n, const paddle::platform::complex alpha, + const paddle::platform::complex *X, const int incX, + paddle::platform::complex *Y, const int incY) { cblas_zaxpy(n, &alpha, X, incX, Y, incY); } template static void GEMV(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA, const int M, const int N, - const paddle::platform::complex128 alpha, - const paddle::platform::complex128 *A, const int lda, - const paddle::platform::complex128 *X, const int incX, - const paddle::platform::complex128 beta, - paddle::platform::complex128 *Y, const int incY) { + const paddle::platform::complex alpha, + const paddle::platform::complex *A, const int lda, + const paddle::platform::complex *X, const int incX, + const paddle::platform::complex beta, + paddle::platform::complex *Y, const int incY) { cblas_zgemv(layout, TransA, M, N, &alpha, A, lda, X, incX, &beta, Y, incY); } template static void GEMM(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, - const int K, const paddle::platform::complex128 alpha, - const paddle::platform::complex128 *A, const int lda, - const paddle::platform::complex128 *B, const int ldb, - const paddle::platform::complex128 beta, - paddle::platform::complex128 *C, const int ldc) { + const int K, const paddle::platform::complex alpha, + const paddle::platform::complex *A, const int lda, + const paddle::platform::complex *B, const int ldb, + const paddle::platform::complex beta, + paddle::platform::complex *C, const int ldc) { cblas_zgemm(layout, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc); } diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index 81110b591a1cbb3dd60a618b329b70e71b4912fe..788ebc6ad985c5fb6e6667220713783f014d2a62 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -213,13 +213,13 @@ struct CUBlas { }; template <> -struct CUBlas { - using complex64 = platform::complex64; - +struct CUBlas> { static void GEMV(rocblas_handle handle, rocblas_operation transa, int m, - int n, const complex64 *alpha, const complex64 *A, int lda, - const complex64 *B, int ldb, const complex64 *beta, - complex64 *C, int ldc) { + int n, const platform::complex *alpha, + const platform::complex *A, int lda, + const platform::complex *B, int ldb, + const platform::complex *beta, + platform::complex *C, int ldc) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_cgemv( handle, transa, m, n, reinterpret_cast(alpha), @@ -229,9 +229,10 @@ struct CUBlas { reinterpret_cast(C), ldc)); } - static void AXPY(rocblas_handle handle, int n, const complex64 *alpha, - const complex64 *X, const int incX, complex64 *Y, - const int incY) { + static void AXPY(rocblas_handle handle, int n, + const platform::complex *alpha, + const platform::complex *X, const int incX, + platform::complex *Y, const int incY) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_caxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, @@ -241,11 +242,13 @@ struct CUBlas { static void GEMM_STRIDED_BATCH(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, int m, int n, int k, - const complex64 *alpha, const complex64 *A, - int lda, long long int strideA, // NOLINT 
- const complex64 *B, // NOLINT - int ldb, long long int strideB, // NOLINT - const complex64 *beta, complex64 *C, int ldc, + const platform::complex *alpha, + const platform::complex *A, int lda, + long long int strideA, // NOLINT + const platform::complex *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const platform::complex *beta, + platform::complex *C, int ldc, long long int strideC, // NOLINT int batchCount) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -261,9 +264,11 @@ struct CUBlas { static void GEMM(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, int m, int n, int k, - const complex64 *alpha, const complex64 *A, int lda, - const complex64 *B, int ldb, const complex64 *beta, - complex64 *C, int ldc) { + const platform::complex *alpha, + const platform::complex *A, int lda, + const platform::complex *B, int ldb, + const platform::complex *beta, + platform::complex *C, int ldc) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_cgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), @@ -293,13 +298,13 @@ struct CUBlas { }; template <> -struct CUBlas { - using complex128 = platform::complex128; - +struct CUBlas> { static void GEMV(rocblas_handle handle, rocblas_operation transa, int m, - int n, const complex128 *alpha, const complex128 *A, int lda, - const complex128 *B, int ldb, const complex128 *beta, - complex128 *C, int ldc) { + int n, const platform::complex *alpha, + const platform::complex *A, int lda, + const platform::complex *B, int ldb, + const platform::complex *beta, + platform::complex *C, int ldc) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zgemv( handle, transa, m, n, reinterpret_cast(alpha), @@ -309,9 +314,10 @@ struct CUBlas { reinterpret_cast(C), ldc)); } - static void AXPY(rocblas_handle handle, int n, const complex128 *alpha, - const complex128 *X, const int incX, complex128 *Y, - const int incY) { + static void AXPY(rocblas_handle handle, int n, + const platform::complex *alpha, + const platform::complex *X, const int incX, + platform::complex *Y, const int incY) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zaxpy( handle, n, reinterpret_cast(alpha), reinterpret_cast(X), incX, @@ -321,11 +327,13 @@ struct CUBlas { static void GEMM_STRIDED_BATCH(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, int m, int n, int k, - const complex128 *alpha, const complex128 *A, - int lda, long long int strideA, // NOLINT - const complex128 *B, // NOLINT - int ldb, long long int strideB, // NOLINT - const complex128 *beta, complex128 *C, int ldc, + const platform::complex *alpha, + const platform::complex *A, int lda, + long long int strideA, // NOLINT + const platform::complex *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const platform::complex *beta, + platform::complex *C, int ldc, long long int strideC, // NOLINT int batchCount) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -341,9 +349,11 @@ struct CUBlas { static void GEMM(rocblas_handle handle, rocblas_operation transa, rocblas_operation transb, int m, int n, int k, - const complex128 *alpha, const complex128 *A, int lda, - const complex128 *B, int ldb, const complex128 *beta, - complex128 *C, int ldc) { + const platform::complex *alpha, + const platform::complex *A, int lda, + const platform::complex *B, int ldb, + const platform::complex *beta, + platform::complex *C, int ldc) { PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::rocblas_zgemm( handle, transa, transb, m, n, k, reinterpret_cast(alpha), @@ -434,9 +444,9 @@ template <> 
template <> inline void Blas::GEMM( CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, - platform::complex64 alpha, const platform::complex64 *A, - const platform::complex64 *B, platform::complex64 beta, - platform::complex64 *C) const { + platform::complex alpha, const platform::complex *A, + const platform::complex *B, platform::complex beta, + platform::complex *C) const { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -461,7 +471,7 @@ inline void Blas::GEMM( thrust::complex c_beta = thrust::complex(beta.real, beta.imag); auto &cuda_ctx = const_cast(context_); - CUBlas::GEMM_EX( + CUBlas>::GEMM_EX( &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, rocblas_datatype_f32_c, ldb, A, rocblas_datatype_f32_c, lda, &c_beta, C, rocblas_datatype_f32_c, N, rocblas_datatype_f32_c); @@ -471,9 +481,9 @@ template <> template <> inline void Blas::GEMM( CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, - platform::complex128 alpha, const platform::complex128 *A, - const platform::complex128 *B, platform::complex128 beta, - platform::complex128 *C) const { + platform::complex alpha, const platform::complex *A, + const platform::complex *B, platform::complex beta, + platform::complex *C) const { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -499,7 +509,7 @@ inline void Blas::GEMM( thrust::complex(beta.real, beta.imag); auto &cuda_ctx = const_cast(context_); - CUBlas::GEMM_EX( + CUBlas>::GEMM_EX( &cuda_ctx, cuTransB, cuTransA, N, M, K, &c_alpha, B, rocblas_datatype_f64_c, ldb, A, rocblas_datatype_f64_c, lda, &c_beta, C, rocblas_datatype_f64_c, N, rocblas_datatype_f64_c); diff --git a/paddle/fluid/operators/math/complex_functors.h b/paddle/fluid/operators/math/complex_functors.h index 0e8aed40f6e16a6bd5395bdeadd49b80a132ae6f..c4bd6ec4f14a27c76e3ae9f977625f312600065b 100644 --- a/paddle/fluid/operators/math/complex_functors.h +++ b/paddle/fluid/operators/math/complex_functors.h @@ -16,8 +16,7 @@ limitations under the License. 
*/ #include -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/hostdevice.h" namespace paddle { @@ -65,8 +64,9 @@ using select_t = typename select::type; template using Real = - select_t::value, float>, - cond::value, double>, T>; + select_t>::value, float>, + cond>::value, double>, + T>; template using Complex = typename std::enable_if::value>::type; @@ -76,14 +76,14 @@ template using NoComplex = typename std::enable_if::value>::type; template -using EnableComplex = - typename std::enable_if::value || - std::is_same::value>::type; +using EnableComplex = typename std::enable_if< + std::is_same>::value || + std::is_same>::value>::type; template using DisableComplex = typename std::enable_if< - !std::is_same::value && - !std::is_same::value>::type; + !std::is_same>::value && + !std::is_same>::value>::type; template struct RealFunctor; @@ -173,44 +173,45 @@ struct AbsGradFunctor { }; template <> -struct AbsGradFunctor { - AbsGradFunctor(const float* dout, const paddle::platform::complex64* x, - paddle::platform::complex64* output, int64_t numel) +struct AbsGradFunctor> { + AbsGradFunctor(const float* dout, const paddle::platform::complex* x, + paddle::platform::complex* output, int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex64(0)) { - output_[idx] = paddle::platform::complex64(0); + if (x_[idx] == paddle::platform::complex(0)) { + output_[idx] = paddle::platform::complex(0); } else { - output_[idx] = paddle::platform::complex64(dout_[idx]) * - (x_[idx] / paddle::platform::complex64(abs(x_[idx]))); + output_[idx] = paddle::platform::complex(dout_[idx]) * + (x_[idx] / paddle::platform::complex(abs(x_[idx]))); } } const float* dout_; - const paddle::platform::complex64* x_; - paddle::platform::complex64* output_; + const paddle::platform::complex* x_; + paddle::platform::complex* output_; int64_t numel_; }; template <> -struct AbsGradFunctor { - AbsGradFunctor(const double* dout, const paddle::platform::complex128* x, - paddle::platform::complex128* output, int64_t numel) +struct AbsGradFunctor> { + AbsGradFunctor(const double* dout, const paddle::platform::complex* x, + paddle::platform::complex* output, int64_t numel) : dout_(dout), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex128(0)) { - output_[idx] = paddle::platform::complex128(0); + if (x_[idx] == paddle::platform::complex(0)) { + output_[idx] = paddle::platform::complex(0); } else { - output_[idx] = paddle::platform::complex128(dout_[idx]) * - (x_[idx] / paddle::platform::complex128(abs(x_[idx]))); + output_[idx] = + paddle::platform::complex(dout_[idx]) * + (x_[idx] / paddle::platform::complex(abs(x_[idx]))); } } const double* dout_; - const paddle::platform::complex128* x_; - paddle::platform::complex128* output_; + const paddle::platform::complex* x_; + paddle::platform::complex* output_; int64_t numel_; }; @@ -234,46 +235,46 @@ struct AbsGradGradFunctor { }; template <> -struct AbsGradGradFunctor { - AbsGradGradFunctor(const paddle::platform::complex128* ddx, - const paddle::platform::complex128* x, - paddle::platform::complex128* output, int64_t numel) +struct AbsGradGradFunctor> { + AbsGradGradFunctor(const paddle::platform::complex* ddx, + const paddle::platform::complex* x, + paddle::platform::complex* output, int64_t 
numel) : ddx_(ddx), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex128(0)) { - output_[idx] = paddle::platform::complex128(0); + if (x_[idx] == paddle::platform::complex(0)) { + output_[idx] = paddle::platform::complex(0); } else { - output_[idx] = paddle::platform::complex128(ddx_[idx]) * x_[idx] / - paddle::platform::complex128(abs(x_[idx])); + output_[idx] = paddle::platform::complex(ddx_[idx]) * x_[idx] / + paddle::platform::complex(abs(x_[idx])); } } - const paddle::platform::complex128* ddx_; - const paddle::platform::complex128* x_; - paddle::platform::complex128* output_; + const paddle::platform::complex* ddx_; + const paddle::platform::complex* x_; + paddle::platform::complex* output_; int64_t numel_; }; template <> -struct AbsGradGradFunctor { - AbsGradGradFunctor(const paddle::platform::complex64* ddx, - const paddle::platform::complex64* x, - paddle::platform::complex64* output, int64_t numel) +struct AbsGradGradFunctor> { + AbsGradGradFunctor(const paddle::platform::complex* ddx, + const paddle::platform::complex* x, + paddle::platform::complex* output, int64_t numel) : ddx_(ddx), x_(x), output_(output), numel_(numel) {} HOSTDEVICE void operator()(int64_t idx) const { - if (x_[idx] == paddle::platform::complex64(0)) { - output_[idx] = paddle::platform::complex64(0); + if (x_[idx] == paddle::platform::complex(0)) { + output_[idx] = paddle::platform::complex(0); } else { - output_[idx] = paddle::platform::complex64(ddx_[idx]) * x_[idx] / - paddle::platform::complex64(abs(x_[idx])); + output_[idx] = paddle::platform::complex(ddx_[idx]) * x_[idx] / + paddle::platform::complex(abs(x_[idx])); } } - const paddle::platform::complex64* ddx_; - const paddle::platform::complex64* x_; - paddle::platform::complex64* output_; + const paddle::platform::complex* ddx_; + const paddle::platform::complex* x_; + paddle::platform::complex* output_; int64_t numel_; }; template diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index d62c1e42d3bc44c7e028201f93948e0c227ee53e..58f936788a363e8473ea402b62fb7edc2fc83236 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include +#include "gflags/gflags.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/concat_and_split.h" @@ -242,8 +243,28 @@ class ConcatFunctor { int in_col = input[0].numel() / in_row; int out_row = in_row, out_col = 0; - std::vector inputs_data(in_num); - std::vector inputs_col(in_num + 1); + int inputs_col_num = in_num + 1; + std::vector inputs_data_vec(in_num); + std::vector inputs_col_vec(inputs_col_num); + const T** inputs_data = inputs_data_vec.data(); + int* inputs_col = inputs_col_vec.data(); + +// There are some differences between hip runtime and NV runtime. +// In NV, when the pageable memory data less than 64K is transferred from +// hosttodevice, it will be automatically asynchronous. +// However, only pinned memory in hip can copy asynchronously +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device +// 3.2.6.1. 
Concurrent Execution between Host and Device +// Memory copies from host to device of a memory block of 64 KB or less +#ifdef PADDLE_WITH_HIP + memory::AllocationPtr data_alloc, col_alloc; + data_alloc = + memory::Alloc(platform::CUDAPinnedPlace(), in_num * sizeof(T*)); + inputs_data = reinterpret_cast(data_alloc->ptr()); + col_alloc = memory::Alloc(platform::CUDAPinnedPlace(), + inputs_col_num * sizeof(int)); + inputs_col = reinterpret_cast(col_alloc->ptr()); +#endif inputs_col[0] = 0; bool has_same_shape = true; @@ -264,12 +285,11 @@ class ConcatFunctor { memory::allocation::AllocationPtr tmp_dev_ins_data; const T** dev_ins_data = nullptr; if (!has_same_shape || in_num < 2 || in_num > 4) { - tmp_dev_ins_data = - memory::Alloc(context, inputs_data.size() * sizeof(T*)); + tmp_dev_ins_data = memory::Alloc(context, in_num * sizeof(T*)); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), tmp_dev_ins_data->ptr(), platform::CPUPlace(), - static_cast(inputs_data.data()), - inputs_data.size() * sizeof(T*), context.stream()); + static_cast(inputs_data), in_num * sizeof(T*), + context.stream()); dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); } @@ -292,17 +312,29 @@ class ConcatFunctor { } } else { auto tmp_dev_ins_col_data = - memory::Alloc(context, inputs_col.size() * sizeof(int)); + memory::Alloc(context, inputs_col_num * sizeof(int)); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), - static_cast(inputs_col.data()), - inputs_col.size() * sizeof(int), context.stream()); + static_cast(inputs_col), inputs_col_num * sizeof(int), + context.stream()); int* dev_ins_col_data = static_cast(tmp_dev_ins_col_data->ptr()); ConcatKernel<<>>( - dev_ins_data, dev_ins_col_data, static_cast(inputs_col.size()), + dev_ins_data, dev_ins_col_data, static_cast(inputs_col_num), out_row, out_col, output->data()); } +#ifdef PADDLE_WITH_HIP + // Prevent the pinned memory value from being covered and release the memory + // after the launch kernel of the stream is executed (reapply pinned memory + // next time) + auto* data_alloc_released = data_alloc.release(); + auto* col_alloc_released = col_alloc.release(); + context.AddStreamCallback([data_alloc_released, col_alloc_released] { + memory::allocation::AllocationDeleter deleter; + deleter(data_alloc_released); + deleter(col_alloc_released); + }); +#endif } }; @@ -313,6 +345,7 @@ class ConcatFunctor { template class SplitFunctor { public: + SplitFunctor(); void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const std::vector& ref_inputs, @@ -329,8 +362,27 @@ class SplitFunctor { int64_t in_col = 0, in_row = out_row; bool has_same_shape = true; - std::vector outputs_data(o_num); - std::vector outputs_cols(o_num + 1); + int outputs_cols_num = o_num + 1; + std::vector outputs_data_vec(o_num); + std::vector outputs_cols_vec(outputs_cols_num); + T** outputs_data = outputs_data_vec.data(); + int64_t* outputs_cols = outputs_cols_vec.data(); + +// There are some differences between hip runtime and NV runtime. +// In NV, when the pageable memory data less than 64K is transferred from +// hosttodevice, it will be automatically asynchronous. +// However, only pinned memory in hip can copy asynchronously +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#concurrent-execution-host-device +// 3.2.6.1. 
Concurrent Execution between Host and Device +// Memory copies from host to device of a memory block of 64 KB or less +#ifdef PADDLE_WITH_HIP + memory::AllocationPtr data_alloc, cols_alloc; + data_alloc = memory::Alloc(platform::CUDAPinnedPlace(), o_num * sizeof(T*)); + outputs_data = reinterpret_cast(data_alloc->ptr()); + cols_alloc = memory::Alloc(platform::CUDAPinnedPlace(), + (outputs_cols_num) * sizeof(int64_t)); + outputs_cols = reinterpret_cast(cols_alloc->ptr()); +#endif outputs_cols[0] = 0; for (int i = 0; i < o_num; ++i) { @@ -354,12 +406,11 @@ class SplitFunctor { memory::allocation::AllocationPtr tmp_dev_outs_data; T** dev_out_gpu_data = nullptr; if (!has_same_shape || o_num < 2 || o_num > 4) { - tmp_dev_outs_data = - memory::Alloc(context, outputs_data.size() * sizeof(T*)); + tmp_dev_outs_data = memory::Alloc(context, o_num * sizeof(T*)); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), tmp_dev_outs_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs_data.data()), - outputs_data.size() * sizeof(T*), context.stream()); + reinterpret_cast(outputs_data), o_num * sizeof(T*), + context.stream()); dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); } @@ -382,20 +433,30 @@ class SplitFunctor { } } else { auto tmp_dev_ins_col_data = - memory::Alloc(context, - - outputs_cols.size() * sizeof(int64_t)); + memory::Alloc(context, outputs_cols_num * sizeof(int64_t)); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs_cols.data()), - outputs_cols.size() * sizeof(int64_t), context.stream()); + reinterpret_cast(outputs_cols), + outputs_cols_num * sizeof(int64_t), context.stream()); int64_t* dev_outs_col_data = reinterpret_cast(tmp_dev_ins_col_data->ptr()); SplitKernel<<>>( input.data(), in_row, in_col, dev_outs_col_data, - static_cast(outputs_cols.size()), dev_out_gpu_data); + static_cast(outputs_cols_num), dev_out_gpu_data); } +#ifdef PADDLE_WITH_HIP + // Prevent the pinned memory value from being covered and release the memory + // after the launch kernel of the stream is executed (reapply pinned memory + // next time) + auto* data_alloc_released = data_alloc.release(); + auto* cols_alloc_released = cols_alloc.release(); + context.AddStreamCallback([data_alloc_released, cols_alloc_released] { + memory::allocation::AllocationDeleter deleter; + deleter(data_alloc_released); + deleter(cols_alloc_released); + }); +#endif } }; diff --git a/paddle/fluid/operators/math/concat_and_split.h b/paddle/fluid/operators/math/concat_and_split.h index d6ad3aec22b1fed22e317b9935be56172fe0ec8d..65d2ca79e60c2ec90d879ce9818c398adc93c73c 100644 --- a/paddle/fluid/operators/math/concat_and_split.h +++ b/paddle/fluid/operators/math/concat_and_split.h @@ -65,16 +65,16 @@ class SplitFunctor { } // namespace operators } // namespace paddle -#define FOR_ALL_TYPES(macro) \ - macro(int); \ - macro(float); \ - macro(double); \ - macro(bool); \ - macro(int64_t); \ - macro(int16_t); \ - macro(uint8_t); \ - macro(int8_t); \ - macro(::paddle::platform::float16); \ - macro(::paddle::platform::bfloat16); \ - macro(::paddle::platform::complex64); \ - macro(::paddle::platform::complex128) +#define FOR_ALL_TYPES(macro) \ + macro(int); \ + macro(float); \ + macro(double); \ + macro(bool); \ + macro(int64_t); \ + macro(int16_t); \ + macro(uint8_t); \ + macro(int8_t); \ + macro(::paddle::platform::float16); \ + macro(::paddle::platform::bfloat16); \ + macro(::paddle::platform::complex); \ + 
macro(::paddle::platform::complex); diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc index 011c85caf04bbb3881a856caece3e3db70a055fc..c8e2acea451a473b757dcbd912bed1e9970e0bd1 100644 --- a/paddle/fluid/operators/math/concat_test.cc +++ b/paddle/fluid/operators/math/concat_test.cc @@ -437,6 +437,8 @@ void TestConcatMain() { ConcatCase2(context); ConcatCase3(context); ConcatCase4(context); + + delete context; } TEST(math, concat) { diff --git a/paddle/fluid/operators/math/functors.h b/paddle/fluid/operators/math/functors.h index bf64d7e8ceb23dacba5cbe226549a19b898cfa8d..054018b10e87e421c45846abf550f0f7a552f6a3 100644 --- a/paddle/fluid/operators/math/functors.h +++ b/paddle/fluid/operators/math/functors.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math.h" namespace paddle { @@ -40,6 +41,11 @@ struct AddFunctor { inline HOSTDEVICE T operator()(T x, T y) { return x + y; } }; +template +struct MaxFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a < b ? b : a; } +}; + template struct AddGradFunctor { inline HOSTDEVICE T Dx(T x, T y) { return static_cast(1.); } @@ -130,6 +136,63 @@ struct SigmoidGradFunctor { } }; +template +struct GeluFunctor { + using MT = typename details::MPTypeTrait::Type; + inline HOSTDEVICE T operator()(T x) { + // this function is tanh approximation of gelu + // actual gelu is: + // x * 0.5 * (1.0 + torch.erf(x * 0.70710678)) + MT mx = static_cast(x); + MT out = mx * static_cast(0.5) * + (static_cast(1.0) + + tanh(static_cast(0.79788456) * mx * + (static_cast(1) + static_cast(0.044715) * mx * mx))); + return static_cast(out); + } +}; + +template +struct GeluGradFunctor { + using MT = typename details::MPTypeTrait::Type; + inline HOSTDEVICE T UseX(T x) { + MT mx = static_cast(x); + MT tanh_out = + tanh(static_cast(0.79788456) * mx * + (static_cast(1) + static_cast(0.044715) * mx * mx)); + MT ans = static_cast(0.5) * mx * + ((static_cast(1) - tanh_out * tanh_out) * + (static_cast(0.79788456) + + static_cast(0.1070322243) * mx * mx)) + + static_cast(0.5) * (static_cast(1) + tanh_out); + return static_cast(ans); + } + inline HOSTDEVICE T UseOut(T x) { + MT mx = static_cast(x); + MT tanh_out = + tanh(static_cast(0.79788456) * mx * + (static_cast(1) + static_cast(0.044715) * mx * mx)); + MT ans = static_cast(0.5) * mx * + ((static_cast(1) - tanh_out * tanh_out) * + (static_cast(0.79788456) + + static_cast(0.1070322243) * mx * mx)) + + static_cast(0.5) * (static_cast(1) + tanh_out); + return static_cast(ans); + } + inline HOSTDEVICE T UseXAndOut(T x, T out) { + MT mx = static_cast(x); + MT tanh_out = + tanh(static_cast(0.79788456) * mx * + (static_cast(1) + static_cast(0.044715) * mx * mx)); + MT ans = static_cast(0.5) * mx * + ((static_cast(1) - tanh_out * tanh_out) * + (static_cast(0.79788456) + + static_cast(0.1070322243) * mx * mx)) + + static_cast(0.5) * (static_cast(1) + tanh_out); + return static_cast(ans); + } +}; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/math_cuda_utils.h b/paddle/fluid/operators/math/math_cuda_utils.h index e97dbd20ca142af75420ccf3ce349c1bdc928b09..8de4e8221c0e473e4577cf897762b8773f50ebb3 100644 --- a/paddle/fluid/operators/math/math_cuda_utils.h +++ b/paddle/fluid/operators/math/math_cuda_utils.h @@ -188,6 +188,7 @@ __inline__ __device__ T blockReduceSum(T val, unsigned mask) { val = warpReduceSum(val, 
mask); + __syncthreads(); if (lane == 0) shared[wid] = val; __syncthreads(); diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 0bdc7b69434221ffd91b0df94287df0eae42d89b..1266ee7462d2d5cca38905bcfde54932f0f8efb5 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -45,8 +45,10 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; #ifdef PADDLE_WITH_XPU template struct SetConstant; @@ -57,27 +59,29 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; #endif -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, RANK>; \ + template struct Transpose, RANK>; DEFINE_CPU_TRANS(1); DEFINE_CPU_TRANS(2); @@ -128,8 +132,8 @@ DEFINE_CPU_TRANS_NORMAL(bool); DEFINE_CPU_TRANS_NORMAL(int16_t); DEFINE_CPU_TRANS_NORMAL(uint8_t); DEFINE_CPU_TRANS_NORMAL(int8_t); -DEFINE_CPU_TRANS_NORMAL(platform::complex64); -DEFINE_CPU_TRANS_NORMAL(platform::complex128); +DEFINE_CPU_TRANS_NORMAL(platform::complex); +DEFINE_CPU_TRANS_NORMAL(platform::complex); struct TensorSetConstantCPU { TensorSetConstantCPU(framework::Tensor* tensor, float value) @@ -158,6 +162,14 @@ void set_constant_with_place( PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported")); } +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW( + platform::errors::Unimplemented("NPUPinnedPlace is not supported")); +} + template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index f94c1bf696cdad5727fcf9ae659c1430b0f8bef4..248f62129991328fd59886192bd7de95bf2b3037 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -20,8 +20,6 @@ limitations under the License. 
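The concat_and_split.cu hunks above stage the small host-side pointer and offset arrays in pinned memory so the host-to-device copy can run asynchronously (on HIP only pinned memory copies asynchronously), and they keep the buffer alive until the stream has consumed it. The sketch below shows that pattern with plain CUDA runtime calls; it is illustrative only, the HIP spellings (hipHostMalloc, hipMemcpyAsync, hipHostFree) mirror them, and unlike the real code it synchronizes before freeing instead of deferring the free through AddStreamCallback.

#include <cuda_runtime.h>
#include <vector>

void CopyColumnsAsync(const std::vector<int>& cols, int* device_dst,
                      cudaStream_t stream) {
  int* pinned = nullptr;
  cudaMallocHost(reinterpret_cast<void**>(&pinned), cols.size() * sizeof(int));
  for (size_t i = 0; i < cols.size(); ++i) pinned[i] = cols[i];  // fill on host
  cudaMemcpyAsync(device_dst, pinned, cols.size() * sizeof(int),
                  cudaMemcpyHostToDevice, stream);
  // Simplified: wait until the copy has read the pinned buffer, then free.
  // The Paddle code instead releases it from a stream callback so the launch
  // path stays non-blocking.
  cudaStreamSynchronize(stream);
  cudaFreeHost(pinned);
}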
*/ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -30,8 +28,6 @@ namespace math { using float16 = paddle::platform::float16; using bfloat16 = paddle::platform::bfloat16; -using complex64 = paddle::platform::complex64; -using complex128 = paddle::platform::complex128; template struct SetConstant; template struct SetConstant; @@ -41,19 +37,23 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, RANK>; \ + template struct Transpose, RANK>; DEFINE_GPU_TRANS(1); DEFINE_GPU_TRANS(2); @@ -143,8 +143,8 @@ DEFINE_GPU_TRANS_NORMAL(bool); DEFINE_GPU_TRANS_NORMAL(int16_t); DEFINE_GPU_TRANS_NORMAL(uint8_t); DEFINE_GPU_TRANS_NORMAL(int8_t); -DEFINE_GPU_TRANS_NORMAL(complex64); -DEFINE_GPU_TRANS_NORMAL(complex128); +DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); +DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); struct TensorSetConstantGPU { TensorSetConstantGPU(const platform::DeviceContext& context, diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/fluid/operators/math/math_function_test.cc index 3388d7edafecc4c0dd3a041316dc6f171d035319..32f9938dcacfbb0d314da912dc217949a544ea9b 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/fluid/operators/math/math_function_test.cc @@ -208,6 +208,7 @@ void GemvTest(int m, int n, bool trans) { ASSERT_FLOAT_EQ(data_c[i], sum); } } + delete cpu_place; } TEST(math_function, gemv) { @@ -274,6 +275,7 @@ void GemmWarpTest(int m, int n, int k, T alpha, T beta) { for (int i = 0; i < mat_c_mkl.numel(); ++i) { EXPECT_FLOAT_EQ(CREF[i], CMKL[i]); } + delete cpu_place; } TEST(math_function, gemm_warp) { diff --git a/paddle/fluid/operators/math/padding.h b/paddle/fluid/operators/math/padding.h index 379b21c3c18888989663221052e6e99df80e7e9d..529d39c9ba50f016434b0b14c4d85c84483bad7f 100644 --- a/paddle/fluid/operators/math/padding.h +++ b/paddle/fluid/operators/math/padding.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -29,7 +30,7 @@ template void PadFunction(const framework::ExecutionContext& context, const std::vector& pads, const framework::Tensor& src, T pad_value, framework::Tensor* out) { - Eigen::array, D> paddings; + std::array, D> paddings; for (size_t i = 0; i < paddings.size(); ++i) { paddings[i].first = pads[i * 2]; @@ -41,14 +42,15 @@ void PadFunction(const framework::ExecutionContext& context, auto& place = *context.template device_context().eigen_device(); - out_tensor.device(place) = src_tensor.pad(paddings, pad_value); + EigenPad, T, D>::Eval( + place, out_tensor, src_tensor, paddings, pad_value); } template void PadGradFunction(const framework::ExecutionContext& context, const std::vector& pads, const framework::Tensor& src, framework::Tensor* d_out) { - Eigen::array, D> paddings; + std::array, D> paddings; for (size_t i = 0; i < paddings.size(); ++i) { paddings[i].first = -pads[i * 2]; paddings[i].second = -pads[i * 2 + 1]; @@ -58,7 +60,8 @@ void PadGradFunction(const framework::ExecutionContext& context, auto src_tensor = EigenTensor::From(src); auto& place = *context.template device_context().eigen_device(); - d_out_tensor.device(place) = src_tensor.pad(paddings, static_cast(0)); + EigenPad, T, D>::Eval( + place, d_out_tensor, src_tensor, paddings, static_cast(0)); } template diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index 0b615cefac4eed2b2d972d5ed4b0e3a728d55486..b49b5036ac42e2359a2840f48ab0a42ced6bc406 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -25,14 +25,12 @@ namespace operators { using Tensor = framework::Tensor; template -__global__ void SegmentMeanCustomKernel( - const Index* segment_ids, const T* input, T* output, T* summed_ids, - const Index input_length_size, const Index inner_dim_size, - const Index output_length_size, const Index total_stripe_count) { +__global__ void SegmentSumIdsKernel(const Index* segment_ids, T* summed_ids, + const Index input_length_size, + const Index total_stripe_count) { CUDA_KERNEL_LOOP(stripe_index, total_stripe_count) { - const Index segment_offset = stripe_index % inner_dim_size; - const Index dim_index_base = - stripe_index / inner_dim_size * Index(DimTileSize); + const Index segment_offset = stripe_index; + const Index dim_index_base = stripe_index * Index(DimTileSize); const Index actual_height = min(Index(DimTileSize), input_length_size - dim_index_base); @@ -41,19 +39,20 @@ __global__ void SegmentMeanCustomKernel( if (dim_index_base > 0) { last_segment_id = segment_ids[dim_index_base - 1]; } - if (segment_offset == 0) { - T sum = T(0); - for (Index j = 0; j < actual_height; j++) { - Index current_segment_id = segment_ids[dim_index_base + j]; - // Note(ZHUI): following check may cause - // cudaErrorLaunchOutOfResources. 
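The padding.h hunk above keeps the same convention for the flat pads vector, pads = {before_0, after_0, before_1, after_1, ...}, and only swaps the container and the Eigen call site. A small standalone sketch of that mapping (illustrative helper, not Paddle code):

#include <array>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

template <std::size_t D>
std::array<std::pair<int64_t, int64_t>, D> MakePaddings(
    const std::vector<int>& pads) {
  std::array<std::pair<int64_t, int64_t>, D> paddings;
  for (std::size_t i = 0; i < D; ++i) {
    paddings[i].first = pads[i * 2];       // elements added before dim i
    paddings[i].second = pads[i * 2 + 1];  // elements added after dim i
  }
  return paddings;
}
// PadGradFunction uses the same layout with negated values, which crops the
// gradient back to the original shape.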
- // PADDLE_ENFORCE(current_segment_id >= last_segment_id, - // "the segment ids should be sorted, but got " - // "segment_ids[%d]:%d > segment_ids[%d]:%d.", - // dim_index_base + j - 1, dim_index_base + j, - // last_segment_id, current_segment_id); - - if (j > 0 && current_segment_id > last_segment_id) { + T sum = T(0); + for (Index j = 0; j < actual_height; j++) { + Index current_segment_id = segment_ids[dim_index_base + j]; + PADDLE_ENFORCE(current_segment_id >= last_segment_id, + "the segment ids should be sorted, but got " + "segment_ids[%d]:%d > segment_ids[%d]:%d.", + dim_index_base + j - 1, dim_index_base + j, + last_segment_id, current_segment_id); + if (current_segment_id > last_segment_id) { + for (Index interval_id = last_segment_id + 1; + interval_id < current_segment_id; ++interval_id) { + *(summed_ids + interval_id) = 0; + } + if (j > 0) { if (last_segment_id == first_segment_id) { platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); } else { @@ -61,33 +60,60 @@ __global__ void SegmentMeanCustomKernel( } sum = T(0); } - sum += T(1); - last_segment_id = current_segment_id; } - platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); + sum += T(1); + last_segment_id = current_segment_id; + } + platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); + } +} + +template +__global__ void SegmentMeanKernel(const Index* segment_ids, const T* input, + T* output, T* summed_ids, + const Index input_length_size, + const Index inner_dim_size, + const Index output_length_size, + const Index total_stripe_count) { + CUDA_KERNEL_LOOP(stripe_index, total_stripe_count) { + const Index segment_offset = stripe_index % inner_dim_size; + const Index dim_index_base = + stripe_index / inner_dim_size * Index(DimTileSize); + const Index actual_height = + min(Index(DimTileSize), input_length_size - dim_index_base); + + Index first_segment_id = segment_ids[dim_index_base]; + Index last_segment_id = -1; + if (dim_index_base > 0) { + last_segment_id = segment_ids[dim_index_base - 1]; } - // ensure last_segment_id is the largest - last_segment_id = output_length_size; - __syncthreads(); T sum = T(0); for (Index j = 0; j < actual_height; j++) { Index current_segment_id = segment_ids[dim_index_base + j]; if (current_segment_id > last_segment_id) { - const Index output_index = - last_segment_id * inner_dim_size + segment_offset; - if (last_segment_id == first_segment_id) { - platform::CudaAtomicAdd(output + output_index, - sum / *(summed_ids + last_segment_id)); - } else { - *(output + output_index) = sum / *(summed_ids + last_segment_id); + // reset the interval value which do not have corresponding ids. 
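// CPU sketch of the segment-mean semantics now split across
// SegmentSumIdsKernel and SegmentMeanKernel: per-segment row counts are
// gathered first, sums are divided by them, the ids must be sorted, and
// segment ids with no rows produce zero output rows. Illustrative only,
// not the CUDA implementation.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> SegmentMeanSketch(const std::vector<int64_t>& ids,  // sorted
                                     const std::vector<float>& x,
                                     int64_t num_segments) {
  std::vector<float> sum(num_segments, 0.f);
  std::vector<float> count(num_segments, 0.f);  // plays the role of summed_ids
  for (std::size_t i = 0; i < ids.size(); ++i) {
    sum[ids[i]] += x[i];
    count[ids[i]] += 1.f;
  }
  std::vector<float> out(num_segments, 0.f);  // gaps stay zero-filled
  for (int64_t s = 0; s < num_segments; ++s)
    if (count[s] > 0.f) out[s] = sum[s] / count[s];
  return out;
}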
+ for (Index interval_id = last_segment_id + 1; + interval_id < current_segment_id; ++interval_id) { + *(output + interval_id * inner_dim_size + segment_offset) = T(0); + } + + if (j > 0) { + Index output_index = + last_segment_id * inner_dim_size + segment_offset; + + if (last_segment_id == first_segment_id) { + platform::CudaAtomicAdd(output + output_index, + sum / *(summed_ids + last_segment_id)); + } else { + *(output + output_index) = sum / *(summed_ids + last_segment_id); + } + sum = T(0); } - sum = T(0); } sum += input[(dim_index_base + j) * inner_dim_size + segment_offset]; last_segment_id = current_segment_id; } - const Index output_index = - last_segment_id * inner_dim_size + segment_offset; + Index output_index = last_segment_id * inner_dim_size + segment_offset; platform::CudaAtomicAdd(output + output_index, sum / *(summed_ids + last_segment_id)); } @@ -122,7 +148,7 @@ __global__ void SegmentOpsKernel(const Index* segment_ids, const T* input, // reset the interval value which do not have corresponding ids. for (Index interval_id = last_segment_id + 1; interval_id < current_segment_id; ++interval_id) { - *(output + interval_id * inner_dim_size + segment_offset) = 0; + *(output + interval_id * inner_dim_size + segment_offset) = T(0); } // don't update result when j=0 if (j > 0) { @@ -272,11 +298,25 @@ class SegmentPoolFunctor { framework::Tensor* output, framework::Tensor* summed_ids = nullptr, const std::string pooltype = "SUM") { + if (pooltype == "MEAN") { + // Sum the segment id num first + T DimTileSize = 8; + auto input_length_size = segment_ids.numel(); + auto total_stripe_count = + (input_length_size + DimTileSize - 1) / DimTileSize; + auto config = platform::GetGpuLaunchConfig1D(ctx, total_stripe_count); + SegmentSumIdsKernel< + T, IndexT, IndexT(8)><<>>( + segment_ids.data(), summed_ids->data(), input_length_size, + total_stripe_count); + } + auto h = ArrangeHelper(input.numel(), segment_ids.dims()[0], output->dims()[0]); auto config = platform::GetGpuLaunchConfig1D(ctx, h.total_stripe_count); if (pooltype == "MEAN") { - SegmentMeanCustomKernel< + SegmentMeanKernel< T, IndexT, IndexT(8)><<>>( segment_ids.data(), input.data(), output->data(), diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index f7b16453e0133b060e5040c1130c0a3bca556568..757cac4e4ffce442677eac99bc932f08e6b1cac1 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -14,6 +14,10 @@ limitations under the License. */ #include "paddle/fluid/operators/math/selected_rows_functor.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/operators/mkldnn/axpy_handler.h" +#endif + namespace paddle { namespace operators { namespace math { @@ -285,6 +289,8 @@ template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; // This is a separated namespace for manipulate SelectedRows typed // data. Like merge duplicated rows, adding two SelectedRows etc. @@ -294,21 +300,31 @@ template struct SelectedRowsAddToTensor; // add or mul. 
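The scatter namespace comment above mentions merging duplicated rows; conceptually, MergeAdd sums every value row that carries the same row index into a single output row. A hedged sketch with plain std:: containers standing in for SelectedRows (not the Paddle API):

#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

// rows {0, 1, 0} with values {{1}, {2}, {3}} merge to row 0: {4}, row 1: {2}.
std::map<int64_t, std::vector<float>> MergeAddSketch(
    const std::vector<int64_t>& rows,
    const std::vector<std::vector<float>>& values) {
  std::map<int64_t, std::vector<float>> merged;
  for (std::size_t i = 0; i < rows.size(); ++i) {
    auto& out = merged[rows[i]];
    if (out.empty()) out.assign(values[i].size(), 0.f);
    for (std::size_t j = 0; j < values[i].size(); ++j) out[j] += values[i][j];
  }
  return merged;
}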
namespace scatter { -template -typename std::enable_if< - std::is_floating_point::value && - std::is_same::value>::type -elementwise_add_to(const DeviceContext& ctx, BlasT* blas, - size_t data_len, const T* in, T* out) { - blas->AXPY(data_len, 1., in, out); +template +typename std::enable_if::value>::type +elementwise_add_to(BlasT* blas, size_t data_len, + const T* in, T* out) { +#ifdef PADDLE_WITH_MKLDNN + onednn_handler_axpy(data_len, T(1.f), in, out); +#else + blas->AXPY(data_len, T(1.f), in, out); +#endif } -template -typename std::enable_if< - !std::is_floating_point::value && - std::is_same::value>::type -elementwise_add_to(const DeviceContext& ctx, BlasT* blas, - size_t data_len, const T* in, T* out) { +template +typename std::enable_if::value || + std::is_same::value || + std::is_same>::value || + std::is_same>::value>::type +elementwise_add_to(BlasT* blas, size_t data_len, + const T* in, T* out) { + blas->AXPY(data_len, T(1.f), in, out); +} + +template +typename std::enable_if::value>::type elementwise_add_to( + BlasT* blas, size_t data_len, const T* in, + T* out) { for (size_t i = 0; i < data_len; i++) { out[i] += in[i]; } @@ -412,7 +428,7 @@ struct MergeAdd { out.set_rows(merge_rows); math::SetConstant constant_functor; - constant_functor(context, out.mutable_value(), 0.0); + constant_functor(context, out.mutable_value(), static_cast(0.f)); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { @@ -429,9 +445,9 @@ struct MergeAdd { for (size_t i = 0; i < input_rows.size(); i++) { size_t out_i = rows_to_id[input_rows[i]]; - elementwise_add_to( - context, &blas, static_cast(input_width), - &input_data[i * input_width], &out_data[out_i * input_width]); + elementwise_add_to(&blas, static_cast(input_width), + &input_data[i * input_width], + &out_data[out_i * input_width]); } } } @@ -524,9 +540,9 @@ struct MergeAverage { for (size_t i = 0; i < input_rows.size(); i++) { size_t out_i = rows_to_id[input_rows[i]]; - elementwise_add_to( - context, &blas, static_cast(input_width), - &input_data[i * input_width], &out_data[out_i * input_width]); + elementwise_add_to(&blas, static_cast(input_width), + &input_data[i * input_width], + &out_data[out_i * input_width]); } } size_t input_width_cast = static_cast(input_width); @@ -544,9 +560,11 @@ template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; + paddle::platform::complex>; +template struct MergeAdd>; template struct MergeAdd; + paddle::platform::bfloat16>; template struct MergeAverage; template struct MergeAverage; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 26e9a0de606babfc325de58ba73404191751411c..f3ef537a31b44c70000020f8d1a54c63ba156bc6 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -448,8 +448,9 @@ template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; -template struct MergeAdd; -template struct MergeAdd; +template struct MergeAdd>; +template struct MergeAdd>; template __global__ void UpdateToTensorKernel(const T* selected_rows, diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index d78e3385efb29cbba540d50433bf0fe35cedd448..a73f76f53be052f1d884538f70810be76cacc0bc 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -87,7 +87,11 @@ class Unpool2dMaxFunctor { 
const T* input_data = input.data(); const int* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); +#ifdef __HIPCC__ + int threads = 256; +#else int threads = 1024; +#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMax<<>>( input.numel(), input_data, indices_data, input_height, input_width, @@ -117,7 +121,11 @@ class Unpool2dMaxGradFunctor { const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); +#ifdef __HIPCC__ + int threads = 256; +#else int threads = 1024; +#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMaxGrad<<>>( input.numel(), input_data, indices_data, input_height, input_width, diff --git a/paddle/fluid/operators/math/vol2col_test.cc b/paddle/fluid/operators/math/vol2col_test.cc index cc3b838cbcf1d7a8be016cef91afdd22ef6b1a28..5a8e7fcc2a76c29ce02f856be007ddfc13f3e09f 100644 --- a/paddle/fluid/operators/math/vol2col_test.cc +++ b/paddle/fluid/operators/math/vol2col_test.cc @@ -116,6 +116,9 @@ void testVol2col() { for (int i = 0; i < 12; ++i) { EXPECT_EQ(in_ptr[i], col_2_vol[i]); } + + delete place; + delete context; } TEST(math, vol2col) { diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index c12aecc9ba5160b532c5bb35e2564209946b7f42..988a6c4f7da997277635ab3955dd62b9c93c9171 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -232,7 +232,9 @@ class MatMulGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ !defined(PADDLE_WITH_HIP) - head_number = context.Attr("head_number"); + if (context.HasAttr("head_number")) { + head_number = context.Attr("head_number"); + } #endif if (head_number <= 1 && a.dims().size() == 3 && b.dims().size() <= 2) { @@ -825,6 +827,21 @@ class MatMulOpGrad : public framework::OperatorWithKernel { context->SetOutputDim(y_grad_name, y_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; template diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc index 6fa96aca4be147e9d70c6e62500acaae88822315..7097b5327d86fab115ff85fd114dce6dd9e5ae2f 100644 --- a/paddle/fluid/operators/matmul_op_xpu.cc +++ b/paddle/fluid/operators/matmul_op_xpu.cc @@ -102,6 +102,7 @@ template static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out, bool trans_x, bool trans_y, const paddle::framework::ExecutionContext &ctx) { + using XPUType = typename XPUTypeTrait::Type; const auto &x_dims = x->dims(); const auto &y_dims = y->dims(); auto &dev_ctx = @@ -162,34 +163,36 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out, int ldout = n; if (batch_size <= 1) { int r = 0; - r = xpu::fc_fusion( - dev_ctx.x_context(), x->data(), y->data(), data_c, m, n, k, - mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy, - ldout, alpha, 0, nullptr, xpu::Activation_t::LINEAR); + r = xpu::fc_fusion( + 
dev_ctx.x_context(), reinterpret_cast(x->data()), + reinterpret_cast(y->data()), + reinterpret_cast(data_c), m, n, k, mat_dim_a.trans_, + mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy, ldout, alpha, 0, + nullptr, xpu::Activation_t::LINEAR); PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( "XPU fc_fusion kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); } else { // batch matmul - int r = xpu::fc_batched( - dev_ctx.x_context(), // Context* ctx, - batch_size, // int batch_size, - mat_dim_a.trans_, // bool x_trans, - mat_dim_b.trans_, // bool w_trans, - m, // int m, - n, // int n, - k, // int k, - alpha, // float alpha, - reinterpret_cast(x->data()), // const TX* x, - mat_dim_a.stride_, // int stride_a, - reinterpret_cast(y->data()), // const TW* w, - mat_dim_b.stride_, // int stride_b, - 0.0, // float beta, - reinterpret_cast(data_c), // TY* y, - m * n, // int stride_c, - nullptr, // const float* x_maxptr, - nullptr); // const float* w_maxptr + int r = xpu::fc_batched( + dev_ctx.x_context(), // Context* ctx, + batch_size, // int batch_size, + mat_dim_a.trans_, // bool x_trans, + mat_dim_b.trans_, // bool w_trans, + m, // int m, + n, // int n, + k, // int k, + alpha, // float alpha, + reinterpret_cast(x->data()), // const TX* x, + mat_dim_a.stride_, // int stride_a, + reinterpret_cast(y->data()), // const TW* w, + mat_dim_b.stride_, // int stride_b, + 0.0, // float beta, + reinterpret_cast(data_c), // TY* y, + m * n, // int stride_c, + nullptr, // const float* x_maxptr, + nullptr); // const float* w_maxptr PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( @@ -210,10 +213,14 @@ class MatMulXPUKernel : public framework::OpKernel { out->mutable_data(context.GetPlace()); bool trans_x = context.Attr("transpose_X"); bool trans_y = context.Attr("transpose_Y"); - if (std::getenv("XPU_PADDLE_MAT_MUL_FCINT32") != nullptr) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, context); - } else { + if (std::is_same::value) { MatMulXPUFunction(x, y, out, trans_x, trans_y, context); + } else { + if (std::getenv("XPU_PADDLE_MAT_MUL_FCINT32") != nullptr) { + MatMulXPUFunction(x, y, out, trans_x, trans_y, context); + } else { + MatMulXPUFunction(x, y, out, trans_x, trans_y, context); + } } } }; @@ -224,6 +231,7 @@ class MatMulXPUKernel : public framework::OpKernel { template static framework::Tensor XPUFoldHeadAndLastDims( const DeviceContext &context, const framework::Tensor &input) { + using XPUType = typename XPUTypeTrait::Type; auto in_dims = input.dims(); if (in_dims.size() != 3) { return input; @@ -236,8 +244,9 @@ static framework::Tensor XPUFoldHeadAndLastDims( static_cast(in_dims[1]), static_cast(in_dims[2])}; std::vector axis_host = {1, 0, 2}; - int r = xpu::transpose(context.x_context(), input.data(), output.data(), - in_shape_host, axis_host); + int r = xpu::transpose( + context.x_context(), reinterpret_cast(input.data()), + reinterpret_cast(output.data()), in_shape_host, axis_host); PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( "XPU transpose kernel return wrong value[%d %s]", r, @@ -280,10 +289,14 @@ class MatMulGradXPUKernel : public framework::OpKernel { const framework::Tensor &b, bool trans_b, framework::Tensor *out) const { out->mutable_data(context.GetPlace()); - if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_FCINT32") != nullptr) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); - } else { + if (std::is_same::value) { MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); + } else { + if 
(std::getenv("XPU_PADDLE_MAT_MUL_GRAD_FCINT32") != nullptr) { + MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); + } else { + MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); + } } } @@ -370,10 +383,14 @@ class MatMulGradXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_XPU_KERNEL( - matmul, ops::MatMulXPUKernel); + matmul, ops::MatMulXPUKernel, + ops::MatMulXPUKernel); REGISTER_OP_XPU_KERNEL( matmul_grad, - ops::MatMulGradXPUKernel); + ops::MatMulGradXPUKernel, + ops::MatMulGradXPUKernel); #endif diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 6fccd3657af77eced2d11e97b96c865f6ab92e43..8ac81596a36d3fc417cd54cb880568f69491617e 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -85,9 +85,17 @@ class MatMulV2Op : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto data_type = + auto input_data_type = OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "X", "Y"); - return framework::OpKernelType(data_type, ctx.device_context()); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( @@ -118,6 +126,14 @@ class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker { "Set true to transpose the last two dimensions of Y before " "doing multiplication") .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "bfloat16"}); AddComment( R"DOC(Matrix multiplication Out = X * Y. A has shape (d0, d1 ... M, K), B has shape (d0, d1 ... K, N), Out has shape ((d0, d1 ... M, N)). 
@@ -204,15 +220,15 @@ REGISTER_OP_CPU_KERNEL( matmul_v2, ops::MatMulV2Kernel, ops::MatMulV2Kernel, ops::MatMulV2Kernel, + paddle::platform::complex>, ops::MatMulV2Kernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( matmul_v2_grad, ops::MatMulV2GradKernel, ops::MatMulV2GradKernel, ops::MatMulV2GradKernel, + paddle::platform::complex>, ops::MatMulV2GradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/matmul_v2_op.cu b/paddle/fluid/operators/matmul_v2_op.cu index e819398ec9be9fec0dae9e35d1dbc414d0cc9cb3..2176ab79dd919dec17ca15c0297c87bf2a47e85e 100644 --- a/paddle/fluid/operators/matmul_v2_op.cu +++ b/paddle/fluid/operators/matmul_v2_op.cu @@ -21,12 +21,12 @@ REGISTER_OP_CUDA_KERNEL( matmul_v2, ops::MatMulV2Kernel, ops::MatMulV2Kernel, ops::MatMulV2Kernel, - ops::MatMulV2Kernel, - ops::MatMulV2Kernel); + ops::MatMulV2Kernel>, + ops::MatMulV2Kernel>); REGISTER_OP_CUDA_KERNEL( matmul_v2_grad, ops::MatMulV2GradKernel, ops::MatMulV2GradKernel, ops::MatMulV2GradKernel, - ops::MatMulV2GradKernel, - ops::MatMulV2GradKernel); + ops::MatMulV2GradKernel>, + ops::MatMulV2GradKernel>); diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h index ca20efaad074d76271e6c06992dcf0cc53a8739a..5b114f381996e610f8d220e37661a3bfa059104d 100644 --- a/paddle/fluid/operators/matmul_v2_op.h +++ b/paddle/fluid/operators/matmul_v2_op.h @@ -34,11 +34,13 @@ namespace operators { using framework::Tensor; -template struct IdentityFunctor { HOSTDEVICE explicit inline IdentityFunctor() {} - HOSTDEVICE inline T operator()(const T& x) const { return x; } + template + HOSTDEVICE inline U operator()(const U& x) const { + return x; + } }; template @@ -47,9 +49,9 @@ void ReduceSumForMatmulGrad(const Tensor* input, Tensor* output, const paddle::framework::ExecutionContext& ctx) { #if defined(__NVCC__) || defined(__HIPCC__) auto stream = ctx.cuda_device_context().stream(); - TensorReduce>( - *input, output, reduce_dims, static_cast(0), cub::Sum(), - IdentityFunctor(), stream); + TensorReduce(*input, output, reduce_dims, + static_cast(0), cub::Sum(), + IdentityFunctor(), stream); #else ReduceKernelFunctor( input, output, reduce_dims, true, false, ctx) @@ -483,19 +485,19 @@ struct ConjHelper { }; template -struct ConjHelper { +struct ConjHelper> { explicit ConjHelper(const framework::ExecutionContext& ctx) : ctx_(ctx) {} HOSTDEVICE void operator()(framework::Tensor& src, framework::Tensor& dst) { dst.Resize(src.dims()); - auto* src_data = src.data(); - auto* dst_data = dst.mutable_data( + auto* src_data = src.data>(); + auto* dst_data = dst.mutable_data>( ctx_.GetPlace(), - size_t(src.numel() * sizeof(paddle::platform::complex64))); + size_t(src.numel() * sizeof(paddle::platform::complex))); platform::ForRange for_range( ctx_.template device_context(), src.numel()); - math::ConjFunctor functor( + math::ConjFunctor> functor( src_data, src.numel(), dst_data); for_range(functor); return; @@ -504,19 +506,19 @@ struct ConjHelper { }; template -struct ConjHelper { +struct ConjHelper> { explicit ConjHelper(const framework::ExecutionContext& ctx) : ctx_(ctx) {} HOSTDEVICE void operator()(framework::Tensor& src, framework::Tensor& dst) { dst.Resize(src.dims()); - auto* src_data = src.data(); - auto* dst_data = dst.mutable_data( + auto* src_data = src.data>(); + auto* dst_data = dst.mutable_data>( ctx_.GetPlace(), - size_t(src.numel() * sizeof(paddle::platform::complex128))); + size_t(src.numel() * sizeof(paddle::platform::complex))); platform::ForRange for_range( 
ctx_.template device_context(), src.numel()); - math::ConjFunctor functor( + math::ConjFunctor> functor( src_data, src.numel(), dst_data); for_range(functor); return; diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc index d3022056a47ded99e63aa05c1aca8e9b31ccc3fe..3d77c177500e384e0fa344a70b93ae2ae7582b56 100644 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ b/paddle/fluid/operators/matmul_v2_op_npu.cc @@ -34,7 +34,7 @@ class MatMulV2NPUKernel : public framework::OpKernel { if (x->dims().size() == 2) { out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner( + const auto& runner = NpuOpRunner( "MatMul", {*x, *y}, {*out}, {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}}); @@ -46,7 +46,7 @@ class MatMulV2NPUKernel : public framework::OpKernel { } else if (x->dims().size() > 2) { out->mutable_data(ctx.GetPlace()); - auto runner = + const auto& runner = NpuOpRunner("BatchMatMul", {*x, *y}, {*out}, {{"adj_x1", transpose_x}, {"adj_x2", transpose_y}}); @@ -76,7 +76,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { if (transpose_y) { if (dx) { dx->mutable_data(ctx.GetPlace()); - auto runner_dx = + const auto& runner_dx = NpuOpRunner("MatMul", {*dout, *y}, {*dx}, {{"transpose_x1", false}, {"transpose_x2", false}}); @@ -84,7 +84,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { } if (dy) { dy->mutable_data(ctx.GetPlace()); - auto runner_dy = + const auto& runner_dy = NpuOpRunner("MatMul", {*dout, *x}, {*dy}, {{"transpose_x1", true}, {"transpose_x2", false}}); @@ -94,7 +94,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { } else { if (dx) { dx->mutable_data(ctx.GetPlace()); - auto runner_dx = + const auto& runner_dx = NpuOpRunner("MatMul", {*dout, *y}, {*dx}, {{"transpose_x1", false}, {"transpose_x2", true}}); @@ -102,7 +102,7 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { } if (dy) { dy->mutable_data(ctx.GetPlace()); - auto runner_dy = + const auto& runner_dy = NpuOpRunner("MatMul", {*x, *dout}, {*dy}, {{"transpose_x1", true}, {"transpose_x2", false}}); @@ -113,31 +113,55 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { if (transpose_y) { if (dx) { dx->mutable_data(ctx.GetPlace()); - auto runner_dx = NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", false}}); + const auto& runner_dx = + NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, + {{"adj_x1", false}, {"adj_x2", false}}); runner_dx.Run(stream); } if (dy) { dy->mutable_data(ctx.GetPlace()); - auto runner_dy = NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); + const auto& runner_dy = + NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy}, + {{"adj_x1", true}, {"adj_x2", false}}); runner_dy.Run(stream); } } else { if (dx) { dx->mutable_data(ctx.GetPlace()); - auto runner_dx = NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", true}}); + const auto& runner_dx = + NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, + {{"adj_x1", false}, {"adj_x2", true}}); runner_dx.Run(stream); } if (dy) { dy->mutable_data(ctx.GetPlace()); - auto runner_dy = NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); - runner_dy.Run(stream); + if ((x->dims().size() == 3) && (dout->dims().size() == 3) && + (dy->dims().size() == 2)) { + framework::Tensor dout_; + dout_.ShareDataWith(*dout); + std::vector vec_dim = framework::vectorize(dout_.dims()); + std::vector vec_dim_v{vec_dim[0] * vec_dim[1], vec_dim[2]}; 
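// Hedged sketch (plain loops, not NPU code) of why the fold above is valid:
// with x in [B, M, K] and dout in [B, M, N] but dy expected as [K, N],
// reshaping both to [B*M, ...] lets a single 2-D MatMul with
// transpose_x1 = true compute dy = x_flat^T * dout_flat, which equals the
// sum over the batch of x_b^T * dout_b, i.e. the reduction a 2-D dy needs.
#include <vector>

std::vector<float> FoldedDy(const std::vector<float>& x_flat,     // [B*M, K]
                            const std::vector<float>& dout_flat,  // [B*M, N]
                            int BM, int K, int N) {
  std::vector<float> dy(K * N, 0.f);
  for (int r = 0; r < BM; ++r)
    for (int k = 0; k < K; ++k)
      for (int n = 0; n < N; ++n)
        dy[k * N + n] += x_flat[r * K + k] * dout_flat[r * N + n];
  return dy;
}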
+ dout_.Resize(framework::make_ddim(vec_dim_v)); + + framework::Tensor x_; + x_.ShareDataWith(*x); + std::vector vec_dim_x = framework::vectorize(x_.dims()); + std::vector vec_dim_x_v{vec_dim_x[0] * vec_dim_x[1], + vec_dim_x[2]}; + x_.Resize(framework::make_ddim(vec_dim_x_v)); + const auto& runner_dy = + NpuOpRunner("MatMul", {x_, dout_}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + runner_dy.Run(stream); + } else { + const auto& runner_dy = + NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, + {{"adj_x1", true}, {"adj_x2", false}}); + runner_dy.Run(stream); + } } } } diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc index d992ef847db2aca8bc284781fdd1408d36bd14e5..ae1e9358f68115e4952696325051d142a25789f8 100644 --- a/paddle/fluid/operators/matmul_v2_op_xpu.cc +++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc @@ -25,6 +25,7 @@ template static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, bool trans_x, bool trans_y, const paddle::framework::ExecutionContext& ctx) { + using XPUType = typename XPUTypeTrait::Type; const auto& x_dims = x->dims(); const auto& y_dims = y->dims(); auto& dev_ctx = @@ -75,9 +76,11 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, int batch_size = mat_dim_a.batch_size_; if (batch_size <= 1) { int r = 0; - r = xpu::fc(dev_ctx.x_context(), x->data(), y->data(), - data_c, m, n, k, mat_dim_a.trans_, - mat_dim_b.trans_, nullptr, nullptr, nullptr); + r = xpu::fc( + dev_ctx.x_context(), reinterpret_cast(x->data()), + reinterpret_cast(y->data()), + reinterpret_cast(data_c), m, n, k, mat_dim_a.trans_, + mat_dim_b.trans_, nullptr, nullptr, nullptr); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External( @@ -87,24 +90,24 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, r, XPUAPIErrorMsg[r], m, n, k, mat_dim_a.trans_, mat_dim_b.trans_)); } else { // batch matmul - int r = xpu::fc_batched( - dev_ctx.x_context(), // Context* ctx, - batch_size, // int batch_size, - mat_dim_a.trans_, // bool x_trans, - mat_dim_b.trans_, // bool w_trans, - m, // int m, - n, // int n, - k, // int k, - 1.0, // float alpha, - reinterpret_cast(x->data()), // const TX* x, - mat_dim_a.stride_, // int stride_a, - reinterpret_cast(y->data()), // const TW* w, - mat_dim_b.stride_, // int stride_b, - 0.0, // float beta, - reinterpret_cast(data_c), // TY* y, - m * n, // int stride_c, - nullptr, // const float* x_maxptr, - nullptr); // const float* w_maxptr + int r = xpu::fc_batched( + dev_ctx.x_context(), // Context* ctx, + batch_size, // int batch_size, + mat_dim_a.trans_, // bool x_trans, + mat_dim_b.trans_, // bool w_trans, + m, // int m, + n, // int n, + k, // int k, + 1.0, // float alpha, + reinterpret_cast(x->data()), // const TX* x, + mat_dim_a.stride_, // int stride_a, + reinterpret_cast(y->data()), // const TW* w, + mat_dim_b.stride_, // int stride_b, + 0.0, // float beta, + reinterpret_cast(data_c), // TY* y, + m * n, // int stride_c, + nullptr, // const float* x_maxptr, + nullptr); // const float* w_maxptr PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( @@ -123,10 +126,14 @@ class MatMulV2XPUKernel : public framework::OpKernel { bool trans_x = ctx.Attr("trans_x"); bool trans_y = ctx.Attr("trans_y"); out->mutable_data(ctx.GetPlace()); - if (std::getenv("XPU_PADDLE_MAT_MUL_V2_FCINT32") != nullptr) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); - } else { + if (std::is_same::value) { MatMulXPUFunction(x, y, out, trans_x, 
trans_y, ctx); + } else { + if (std::getenv("XPU_PADDLE_MAT_MUL_V2_FCINT32") != nullptr) { + MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); + } else { + MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); + } } } }; @@ -134,6 +141,7 @@ class MatMulV2XPUKernel : public framework::OpKernel { template static framework::Tensor XPUFoldHeadAndLastDims( const DeviceContext& context, const framework::Tensor& input) { + using XPUType = typename XPUTypeTrait::Type; auto in_dims = input.dims(); if (in_dims.size() != 3) { return input; @@ -147,8 +155,9 @@ static framework::Tensor XPUFoldHeadAndLastDims( static_cast(in_dims[2])}; std::vector axis_host = {1, 0, 2}; - int r = xpu::transpose(context.x_context(), input.data(), output.data(), - in_shape_host, axis_host); + int r = xpu::transpose( + context.x_context(), reinterpret_cast(input.data()), + reinterpret_cast(output.data()), in_shape_host, axis_host); PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( "XPU transpose kernel return wrong value[%d %s]", r, @@ -166,10 +175,14 @@ class MatMulV2XPUGradKernel : public framework::OpKernel { const framework::Tensor& b, bool trans_b, framework::Tensor* out) const { out->mutable_data(ctx.GetPlace()); - if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_V2_FCINT32") != nullptr) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); - } else { + if (std::is_same::value) { MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); + } else { + if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_V2_FCINT32") != nullptr) { + MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); + } else { + MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); + } } } @@ -261,8 +274,10 @@ class MatMulV2XPUGradKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; - -REGISTER_OP_XPU_KERNEL(matmul_v2, ops::MatMulV2XPUKernel); -REGISTER_OP_XPU_KERNEL(matmul_v2_grad, ops::MatMulV2XPUGradKernel); +namespace plat = paddle::platform; +REGISTER_OP_XPU_KERNEL(matmul_v2, ops::MatMulV2XPUKernel, + ops::MatMulV2XPUKernel); +REGISTER_OP_XPU_KERNEL(matmul_v2_grad, ops::MatMulV2XPUGradKernel, + ops::MatMulV2XPUGradKernel); #endif diff --git a/paddle/fluid/operators/mean_op_npu.cc b/paddle/fluid/operators/mean_op_npu.cc index d6e982039fa290ae9095fe380fa22955c6acde70..ab0a3336b361f8c7127019e424b2bf72c6b35385 100644 --- a/paddle/fluid/operators/mean_op_npu.cc +++ b/paddle/fluid/operators/mean_op_npu.cc @@ -30,7 +30,7 @@ class MeanNPUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input); + const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input); auto stream = ctx.template device_context() @@ -61,7 +61,7 @@ class MeanGradNPUKernel : public framework::OpKernel { // ones Tensor ones(grad->type()); ones.mutable_data(IG->dims(), context.GetPlace()); - auto runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {}); + const auto& runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {}); runner_ones.Run(stream); // means @@ -75,11 +75,12 @@ class MeanGradNPUKernel : public framework::OpKernel { Tensor mean_ma(grad->type()); mean_ma.Resize(IG->dims()); mean_ma.mutable_data(context.GetPlace()); - auto runner_mul_1 = NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {}); + const auto& runner_mul_1 = + NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {}); runner_mul_1.Run(stream); // and mul grad - auto runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {}); + const auto& runner_mul_2 = NpuOpRunner("Mul", 
{mean_ma, *grad}, {*IG}, {}); runner_mul_2.Run(stream); } }; diff --git a/paddle/fluid/operators/memcpy_op.cc b/paddle/fluid/operators/memcpy_op.cc index 4e10498efa10c4ca48f3bcc51391c9df00e4f962..ecd2d48dcbd102baffaccfd5de369462b5f8e527 100644 --- a/paddle/fluid/operators/memcpy_op.cc +++ b/paddle/fluid/operators/memcpy_op.cc @@ -141,7 +141,7 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double, ops::MemcpyKernel, plat::float16, ops::MemcpyKernel); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ROCM) REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double, ops::MemcpyKernel, int, ops::MemcpyKernel, int64_t, ops::MemcpyKernel, bool, diff --git a/paddle/fluid/operators/metrics/accuracy_op_npu.cc b/paddle/fluid/operators/metrics/accuracy_op_npu.cc index 4ffcbaf55314a46888e15572e8477054b23ae2bb..f3cab995a08b611c64ec9e3abf9235da8a066eec 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_npu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_npu.cc @@ -23,91 +23,112 @@ template class AccuracyNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* pred = ctx.Input("Out"); + auto* inference = ctx.Input("Out"); auto* label = ctx.Input("Label"); - // auto* logits = ctx.Input("Indices"); + auto* indices = ctx.Input("Indices"); - auto* acc = ctx.Output("Accuracy"); + auto* accuracy = ctx.Output("Accuracy"); auto* correct = ctx.Output("Correct"); auto* total = ctx.Output("Total"); auto stream = ctx.template device_context() .stream(); - // cast pred - Tensor tmp_pred(pred->type()); - tmp_pred.Resize(pred->dims()); - tmp_pred.mutable_data(ctx.GetPlace()); - auto runner_cast_pred = - NpuOpRunner("Cast", {*pred}, {tmp_pred}, - {{"dst_type", static_cast(ACL_INT32)}}); - runner_cast_pred.Run(stream); - - // cast label - Tensor tmp_label(label->type()); - tmp_label.Resize(label->dims()); - tmp_label.mutable_data(ctx.GetPlace()); - auto runner_cast_label = - NpuOpRunner("Cast", {*label}, {tmp_label}, - {{"dst_type", static_cast(ACL_INT32)}}); - runner_cast_label.Run(stream); + int num_samples = inference->dims()[0]; + if (num_samples == 0) { + return; + } + + // cast `indices` or `label` if their type is not consistent + Tensor cast_indices(framework::proto::VarType::INT32); + Tensor cast_label(framework::proto::VarType::INT32); + if (indices->type() != label->type()) { + auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); + if (indices->type() != framework::proto::VarType::INT32) { + cast_indices.Resize(indices->dims()); + cast_indices.mutable_data(ctx.GetPlace()); + const auto& runner_cast_indices = + NpuOpRunner("Cast", {*indices}, {cast_indices}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_indices.Run(stream); + } else { + cast_indices.ShareDataWith(*indices); + } + if (label->type() != framework::proto::VarType::INT32) { + cast_label.Resize(label->dims()); + cast_label.mutable_data(ctx.GetPlace()); + const auto& runner_cast_label = + NpuOpRunner("Cast", {*label}, {cast_label}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_label.Run(stream); + } else { + cast_label.ShareDataWith(*label); + } + } else { + cast_indices.ShareDataWith(*indices); + cast_label.ShareDataWith(*label); + } // equal - Tensor tmp_equal(label->type()); - tmp_equal.Resize(label->dims()); + Tensor tmp_equal(framework::proto::VarType::BOOL); + tmp_equal.Resize(inference->dims()); tmp_equal.mutable_data(ctx.GetPlace()); - auto runner_equal = - 
NpuOpRunner("Equal", {tmp_pred, tmp_label}, {tmp_equal}, {}); + const auto& runner_equal = + NpuOpRunner("Equal", {cast_indices, cast_label}, {tmp_equal}, {}); runner_equal.Run(stream); // cast equal - Tensor tmp_equal_cast(label->type()); - tmp_equal_cast.Resize(label->dims()); + Tensor tmp_equal_cast(framework::proto::VarType::FP32); + tmp_equal_cast.Resize(inference->dims()); tmp_equal_cast.mutable_data(ctx.GetPlace()); - auto runner_cast_equal = - NpuOpRunner("Cast", {tmp_equal}, {tmp_equal_cast}, - {{"dst_type", static_cast(ACL_FLOAT)}}); + const auto& runner_cast_equal = NpuOpRunner( + "Cast", {tmp_equal}, {tmp_equal_cast}, + {{"dst_type", + static_cast(ConvertToNpuDtype(tmp_equal_cast.type()))}}); runner_cast_equal.Run(stream); - // acc - acc->mutable_data(ctx.GetPlace()); - std::vector axes_vec_1; - auto runner_acc = NpuOpRunner("ReduceMeanD", {tmp_equal_cast}, {*acc}, - {{"keep_dims", false}, {"axes", axes_vec_1}}); - runner_acc.Run(stream); - - // correct - correct->mutable_data(ctx.GetPlace()); - std::vector axes_vec_2; - auto runner_correct = - NpuOpRunner("ReduceSumD", {tmp_equal_cast}, {*correct}, - {{"keep_dims", false}, {"axes", axes_vec_2}}); - runner_correct.Run(stream); - - // ones_tensor - Tensor ones_tensor(label->type()); - ones_tensor.Resize(label->dims()); - ones_tensor.mutable_data(ctx.GetPlace()); - auto runner_oneslike = - NpuOpRunner("OnesLike", {tmp_label}, {ones_tensor}, {}); - runner_oneslike.Run(stream); - - // ones_tensor_cast - Tensor ones_tensor_cast(label->type()); - ones_tensor_cast.Resize(label->dims()); - ones_tensor_cast.mutable_data(ctx.GetPlace()); - auto runner_ones_cast = - NpuOpRunner("Cast", {ones_tensor}, {ones_tensor_cast}, - {{"dst_type", static_cast(ACL_FLOAT)}}); - runner_ones_cast.Run(stream); - - // total - total->mutable_data(ctx.GetPlace()); - std::vector axes_vec_3; - auto runner_total = - NpuOpRunner("ReduceSumD", {ones_tensor_cast}, {*total}, - {{"keep_dims", false}, {"axes", axes_vec_3}}); - runner_total.Run(stream); + // [correct] + // reduce_max + Tensor tmp_correct_max(framework::proto::VarType::FP32); + tmp_correct_max.Resize(framework::make_ddim({num_samples})); + tmp_correct_max.mutable_data(ctx.GetPlace()); + const auto& runner_reduce_max = + NpuOpRunner("ReduceMaxD", {tmp_equal_cast}, {tmp_correct_max}, + {{"axes", std::vector{1}}, {"keep_dims", false}}); + runner_reduce_max.Run(stream); + + // reduce_sum + Tensor tmp_correct(framework::proto::VarType::FP32); + tmp_correct.Resize(correct->dims()); + tmp_correct.mutable_data(ctx.GetPlace()); + const auto& runner_reduce_sum = + NpuOpRunner("ReduceSumD", {tmp_correct_max}, {tmp_correct}, + {{"axes", std::vector{0}}, {"keep_dims", false}}); + runner_reduce_sum.Run(stream); + + // cast to int + correct->mutable_data(ctx.GetPlace()); + const auto& runner_cast_correct = NpuOpRunner( + "Cast", {tmp_correct}, {*correct}, + {{"dst_type", static_cast(ConvertToNpuDtype(correct->type()))}}); + runner_cast_correct.Run(stream); + + // [total] + total->mutable_data(ctx.GetPlace()); + FillNpuTensorWithConstant(total, static_cast(num_samples)); + + // use `total` of type `float32` for calculating accuracy + Tensor tmp_total(framework::proto::VarType::FP32); + tmp_total.Resize(total->dims()); + tmp_total.mutable_data(ctx.GetPlace()); + FillNpuTensorWithConstant(&tmp_total, + static_cast(num_samples)); + + // [accuracy] + accuracy->mutable_data(ctx.GetPlace()); + const auto& runner_accuracy = + NpuOpRunner("Div", {tmp_correct, tmp_total}, {*accuracy}, {}); + runner_accuracy.Run(stream); } }; 
diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 5b14d4f6872439325fab505d7e1972e39fe737e3..743a61c744be711ce2e05e16c6e456127e69fc3f 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -146,3 +146,6 @@ REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradDescMaker, ops::MinusGradMaker); REGISTER_OP_CPU_KERNEL( minus, ops::MinusKernel); + +REGISTER_OP_CUDA_KERNEL( + minus, ops::MinusKernel); diff --git a/paddle/fluid/operators/minus_op.h b/paddle/fluid/operators/minus_op.h index 7791b1456a81516e48db645501c717d9c4cf8749..2300506c623ee2c5cbbeb502e80cf10838182a2a 100644 --- a/paddle/fluid/operators/minus_op.h +++ b/paddle/fluid/operators/minus_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -30,9 +31,10 @@ class MinusKernel : public framework::OpKernel { out_tensor->mutable_data(context.GetPlace()); auto& dev = *context.template device_context().eigen_device(); - framework::EigenVector::Flatten(*out_tensor).device(dev) = - framework::EigenVector::Flatten(*left_tensor) - - framework::EigenVector::Flatten(*right_tensor); + EigenSub, T>::Eval( + dev, framework::EigenVector::Flatten(*out_tensor), + framework::EigenVector::Flatten(*left_tensor), + framework::EigenVector::Flatten(*right_tensor)); } }; diff --git a/paddle/fluid/operators/mkldnn/CMakeLists.txt b/paddle/fluid/operators/mkldnn/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce95ec560c25e1ede3e029c755eb208a3a91e7a7 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(mkldnn_axpy_handler SRCS axpy_handler.cc DEPS place device_context enforce) diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 429a8b8456821f148804ac77ed8b388b2b2c45e9..177e539c4b6c294b23dfd10127b9606262d59f71 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -83,30 +83,11 @@ void eltwise_forward(const framework::ExecutionContext &ctx, const auto *x = ctx.Input("X"); auto *y = ctx.Output("Out"); - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; - - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); - } - - PADDLE_ENFORCE( - x->dims().size() >= 1 || x->dims().size() <= 6, - platform::errors::Unimplemented("Input dimension size can be 1, 2, 3, 4, " - "5, or 6, but now the dimension size is", - x->dims().size())); - bool is_inplaced = x->IsSharedBufferWith(*y); - auto src_tz = framework::vectorize(x->dims()); - auto src_format = src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : x->format(); - - platform::ActivationMKLDNNHandler handler( - src_tz, algorithm, alpha, beta, src_format, dev_ctx, ctx.GetPlace(), - ctx.InputName("X"), is_inplaced); + platform::ActivationMKLDNNHandler handler(algorithm, ctx, dev_ctx, + ctx.GetPlace(), x, + ctx.InputName("X"), is_inplaced); auto src_memory_p = handler.AcquireSrcMemory(x); auto dst_memory_p = is_inplaced ? 
src_memory_p : handler.AcquireDstMemory(y); @@ -130,28 +111,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx, const auto *diff_y = ctx.Input(framework::GradVarName("Out")); auto *diff_x = ctx.Output(framework::GradVarName("X")); - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; - - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == mkldnn::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); - } - - auto diff_dst_tz = framework::vectorize(diff_y->dims()); - - // diff_dst and src dims should be the same - auto src_format = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : x->format(); - - auto diff_y_format = - diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : diff_y->format(); - platform::ActivationMKLDNNHandler handler( - diff_dst_tz, algorithm, alpha, beta, src_format, diff_y_format, dev_ctx, - ctx.GetPlace(), ctx.InputName("X")); + algorithm, ctx, dev_ctx, ctx.GetPlace(), x, diff_y, ctx.InputName("X")); auto src_memory_p = handler.AcquireBackwardSrcMemory(x); auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y); diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc new file mode 100644 index 0000000000000000000000000000000000000000..76101f19ab618c8474ee5f1210a51f39c8f4955e --- /dev/null +++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include + +#include "mkldnn.hpp" +#include "paddle/fluid/operators/mkldnn/axpy_handler.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { + +namespace plat = paddle::platform; + +namespace { + +template +class AXPYMKLDNNHandler : public plat::MKLDNNHandlerT { + public: + AXPYMKLDNNHandler(const plat::MKLDNNDeviceContext &dev_ctx, + const dnnl::engine mkldnn_engine, plat::Place cpu_place, + int n, float alpha) + : plat::MKLDNNHandlerT( + dev_ctx, mkldnn_engine, cpu_place, + plat::CreateKey(dev_ctx, static_cast(n), + plat::MKLDNNGetDataType(), alpha, "-axpy")), + alpha_(alpha), + n_(n) {} + + std::shared_ptr AcquireMemory(void *ptr, + const std::string &suffix) { + /*Generate key*/ + auto local_key = this->key_ + suffix; + auto mem_p = std::static_pointer_cast( + this->dev_ctx_.GetBlob(local_key)); + if (mem_p == nullptr) { + auto md = dnnl::memory::desc({n_}, plat::MKLDNNGetDataType(), + dnnl::memory::format_tag::x); + mem_p = std::make_shared(md, this->engine_, ptr); + this->dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + } + return mem_p; + } + + std::shared_ptr AcquireSrcMemory(const T *x) { + return this->AcquireMemory(plat::to_void_cast(x), "@user_src_mem_p"); + } + + std::shared_ptr AcquireDstMemory(T *y) { + return this->AcquireMemory(y, "@user_dst_mem_p"); + } + + std::shared_ptr AcquireReorder( + std::shared_ptr dst_memory_p, + std::shared_ptr src_memory_p) { + auto prim_key = this->key_ + "@reorder_p"; + auto reorder_p = std::static_pointer_cast( + this->dev_ctx_.GetBlob(prim_key)); + if (reorder_p == nullptr) { + // Here we pass Postops to mimick y -> a*X + y + dnnl::primitive_attr reorder_attr; + dnnl::post_ops post_operations; + if (this->alpha_ != 1.f) { + std::vector scales(1, this->alpha_); + reorder_attr.set_output_scales(0, scales); + } + post_operations.append_sum(1.0f); + + reorder_attr.set_post_ops(post_operations); + reorder_p = std::make_shared( + *(src_memory_p), *(dst_memory_p), reorder_attr); + this->dev_ctx_.SetBlob(prim_key, reorder_p); + } + return reorder_p; + } + + private: + float alpha_; + int n_; +}; + +template class AXPYMKLDNNHandler; +template class AXPYMKLDNNHandler; + +} // anonnymouse namespace + +template +static void naive_axpy(int n, T alpha, const T *x, T *y) { + while (n-- > 0) { + *y += alpha * *x; + ++y; + ++x; + } +} + +template +void onednn_handler_axpy(int n, T alpha, const T *x, T *y) { + // fallback to naive version + if (n < 100) { + naive_axpy(n, alpha, x, y); + return; + } + + auto &pool = plat::DeviceContextPool::Instance(); + auto cpu_place = plat::CPUPlace(); + auto *dev_ctx = + dynamic_cast(pool.Get(cpu_place)); + auto &cpu_engine = dev_ctx->GetEngine(); + + AXPYMKLDNNHandler handler(*dev_ctx, cpu_engine, cpu_place, n, + static_cast(alpha)); + + auto reorder_src_memory_p = handler.AcquireSrcMemory(x); + auto reorder_dst_memory_p = handler.AcquireDstMemory(y); + auto reorder_p = + handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); + + auto &astream = plat::MKLDNNDeviceContext::tls().get_stream(); + plat::RecordEvent record_reorder("axpy_int_reorder", + plat::EventRole::kUniqueOp); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); 
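+  // Note: with output_scales set to alpha and a sum post-op appended to the
+  // reorder, this single primitive computes y = alpha * x + y, i.e. the same
+  // result the naive_axpy() fallback above produces element-wise for small n.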
+} + +template void onednn_handler_axpy(int, float, const float *, float *); +template void onednn_handler_axpy(int, plat::bfloat16, + const plat::bfloat16 *, + plat::bfloat16 *); + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.h b/paddle/fluid/operators/mkldnn/axpy_handler.h new file mode 100644 index 0000000000000000000000000000000000000000..8f0fdeb5c02b439e7e531af07728f8d047e32b7c --- /dev/null +++ b/paddle/fluid/operators/mkldnn/axpy_handler.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +namespace paddle { +namespace operators { + +/// +/// @brief Helper function to execute AXPY using oneDNN. +/// +/// @param[in] n The number of elements in tensor (assumed 1D) +/// @param[in] alpha The alpha coefficient. +/// @param[in] x The pointer to input X tensor. +/// @param y The pointer to output Y tensor. +/// +/// @tparam T Data type. +/// +template +void onednn_handler_axpy(int n, T alpha, const T *x, T *y); + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index 75367ba0573209338b3ba85ab2ac7240f07d58d3..99b8d020436fc1418bd8877dd1fd640ae0bb3994 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -85,24 +85,54 @@ class BatchNormMKLDNNHandler md, epsilon, flags); } } - BatchNormMKLDNNHandler(const std::vector &dims, const float &epsilon, - const mkldnn::normalization_flags &flags, - const MKLDNNMemoryFormat diff_fmt, - const MKLDNNMemoryFormat src_fmt, + + BatchNormMKLDNNHandler(const paddle::framework::ExecutionContext &ctx, const platform::MKLDNNDeviceContext &dev_ctx, - platform::Place cpu_place, - const std::string &uniq_name) + platform::Place cpu_place, const Tensor *in_x, + const Tensor *scale, const Tensor *out_grad, + const std::string &unique_name) : platform::MKLDNNHandlerT( dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, dims, uniq_name)) { - auto diff_dst_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), diff_fmt); - auto src_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), src_fmt); - - this->AcquireBackwardPrimitiveDescriptor( - mkldnn::prop_kind::backward, diff_dst_md, src_md, epsilon, flags); + platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()), + unique_name)) { + if (!this->isBwdCached()) { + PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "Wrong layout set for Input out_grad tensor")); + PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for Input out_grad tensor")); + + auto src_tz = paddle::framework::vectorize(in_x->dims()); + auto scale_tz = paddle::framework::vectorize(scale->dims()); + PADDLE_ENFORCE_EQ( + scale_tz.size(), 1, + 
platform::errors::InvalidArgument( + "Dims of scale tensor must be 1, but received scale's size is %d", + scale_tz.size())); + + MKLDNNMemoryFormat diff_fmt = + platform::MKLDNNFormatForSize(src_tz.size(), out_grad->format()); + + MKLDNNMemoryFormat src_fmt = + platform::MKLDNNFormatForSize(src_tz.size(), in_x->format()); + + auto dims = framework::vectorize(in_x->dims()); + auto diff_dst_md = mkldnn::memory::desc( + dims, platform::MKLDNNGetDataType(), diff_fmt); + auto src_md = + mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), src_fmt); + + const float epsilon = ctx.Attr("epsilon"); + + this->AcquireForwardPrimitiveDescriptor( + mkldnn::prop_kind::forward_training, src_md, epsilon, + mkldnn::normalization_flags::use_scale_shift); + this->AcquireBackwardPrimitiveDescriptor( + mkldnn::prop_kind::backward, diff_dst_md, src_md, epsilon, + mkldnn::normalization_flags::use_scale_shift); + } } std::shared_ptr AcquireScaleShiftMemory(const Tensor *scale, @@ -263,8 +293,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto &dev_ctx = ctx.template device_context(); auto mkldnn_engine = dev_ctx.GetEngine(); - const float epsilon = ctx.Attr("epsilon"); - const auto *x = ctx.Input("X"); const auto *scale = ctx.Input("Scale"); const auto *shift = ctx.Input("Bias"); @@ -275,35 +303,11 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto *diff_scale = ctx.Output(framework::GradVarName("Scale")); auto *diff_shift = ctx.Output(framework::GradVarName("Bias")); - PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "Wrong layout set for Input diff_y tensor")); - PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for Input diff_y tensor")); - - auto src_tz = paddle::framework::vectorize(x->dims()); - auto scale_tz = paddle::framework::vectorize(scale->dims()); - PADDLE_ENFORCE_EQ( - scale_tz.size(), 1, - platform::errors::InvalidArgument( - "Dims of scale tensor must be 1, but received scale's size is %d", - scale_tz.size())); - - const unsigned int C = scale_tz[0]; - - MKLDNNMemoryFormat dst_format = - platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); - - MKLDNNMemoryFormat input_format = - platform::MKLDNNFormatForSize(src_tz.size(), x->format()); - - BatchNormMKLDNNHandler handler( - src_tz, epsilon, mkldnn::normalization_flags::use_scale_shift, - dst_format, input_format, dev_ctx, ctx.GetPlace(), - ctx.InputName("SavedMean")); + BatchNormMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), x, scale, + diff_y, ctx.InputName("SavedMean")); // MKLDNN requires a single piece of memory for scale and shift/bias data + const unsigned int C = paddle::framework::vectorize(scale->dims())[0]; const size_t scaleshift_size = 2 * C; std::vector diff_scaleshift_data; diff_scaleshift_data.reserve(scaleshift_size); @@ -335,7 +339,7 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { T *diff_scale_data = diff_scale->mutable_data(ctx.GetPlace()); T *diff_shift_data = diff_shift->mutable_data(ctx.GetPlace()); - // copy back diff sacle/shift to output tensors (diff scale/shift) + // copy back diff scale/shift to output tensors (diff scale/shift) diff_scaleshift_data.resize(scaleshift_size); auto it = std::begin(diff_scaleshift_data); std::copy(it, std::next(it, C), diff_scale_data); diff --git a/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc new file mode 100644 
index 0000000000000000000000000000000000000000..9cfeace6bef99f98fcaa79dae5ba2ff1885092aa --- /dev/null +++ b/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class CastMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + int in_dtype = ctx.Attr("in_dtype"); + int out_dtype = ctx.Attr("out_dtype"); + + auto x_paddle_type = framework::proto::VarType::Type(in_dtype); + auto out_paddle_type = framework::proto::VarType::Type(out_dtype); + + mkldnn::memory::data_type x_type = + framework::ToMKLDNNDataType(x_paddle_type); + mkldnn::memory::data_type out_type = + framework::ToMKLDNNDataType(out_paddle_type); + + auto x_tz = framework::vectorize(x->dims()); + + std::string key = + platform::CreateKey(dev_ctx, x_tz, x->format(), x->format(), x_type); + platform::ReorderMKLDNNHandler reorder_handler( + x_tz, x_paddle_type, x_type, out_paddle_type, out_type, dev_ctx, + dev_ctx.GetEngine(), key); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->format(), platform::to_void_cast(x->data())); + auto reorder_dst_memory_p = + reorder_handler.AcquireDstMemory(out, x->format(), dev_ctx.GetPlace()); + auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, + reorder_src_memory_p); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(cast, MKLDNN, paddle::platform::CPUPlace, + ops::CastMKLDNNKernel, + ops::CastMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index df1b5af121da939ad818d0dacfb8f62a6464cac8..df4750321e3fced1b1b756d648672d7f07baba11 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" @@ -156,6 +157,17 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { "The axis is expected to be in range of [%d, %d), but got %d", -rank, rank, concat_axis)); platform::MKLDNNDeviceContext::tls().log_lib_version(); + + if (ctx.HasInput("AxisTensor")) { + auto* axis_tensor = ctx.Input("AxisTensor"); + concat_axis = GetDataFromTensor(axis_tensor)[0]; + auto out_dims = multi_input[0]->dims(); + for (size_t i = 1; i < multi_input.size(); ++i) { + out_dims[concat_axis] += multi_input[i]->dims()[concat_axis]; + } + output->Resize(out_dims); + } + if (concat_axis < 0) { concat_axis = concat_axis + rank; } diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 73530eac09e99c695ad8185d694ee9e7a4ed4396..0065f3ae39483236622fb13b95ab8b6a14ca4095 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -74,7 +74,9 @@ static mkldnn::memory::data_type GetDstType(bool is_int8, bool is_bfloat16, template class ConvMKLDNNHandlerT - : public platform::MKLDNNHandlerT { + : public platform::MKLDNNHandlerT { public: ConvMKLDNNHandlerT(const paddle::framework::ExecutionContext& ctx, const platform::MKLDNNDeviceContext& dev_ctx, @@ -82,7 +84,9 @@ class ConvMKLDNNHandlerT platform::Place cpu_place, const Tensor* input, const Tensor* filter, const Tensor* bias, Tensor* output, const std::string& unique_name) - : platform::MKLDNNHandlerT( + : platform::MKLDNNHandlerT( dev_ctx, mkldnn_engine, cpu_place, platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), unique_name)) { @@ -237,6 +241,142 @@ class ConvMKLDNNHandlerT } } + ConvMKLDNNHandlerT(const framework::ExecutionContext& ctx, + const platform::MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, const Tensor* in, + const Tensor* filter, const Tensor* bias, + const Tensor* out_grad, Tensor* filter_grad, + Tensor* in_x_grad, const std::string& unique_name) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(in->dims()), + unique_name)) { + if (!this->isBwdCached()) { + PADDLE_ENFORCE_EQ( + in->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "The input tensor's layout should be %d, but got %d.", + DataLayout::kMKLDNN, in->layout())); + PADDLE_ENFORCE_NE(in->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Got wrong format for Input tensor.")); + + PADDLE_ENFORCE_EQ( + filter->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "The filter tensor's layout should be %d, but got %d.", + DataLayout::kMKLDNN, filter->layout())); + PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Got wrong format for Filter tensor.")); + + PADDLE_ENFORCE_EQ( + out_grad->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "The output_grad tensor's layout should be %d, but got %d.", + DataLayout::kMKLDNN, out_grad->layout())); + PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for output_grad tensor")); + + PADDLE_ENFORCE_EQ( + ctx.Attr("is_test"), false, + platform::errors::InvalidArgument( + "is_test attribute should be set to False in training phase.")); 
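+
+      // Read the convolution attributes and rebuild src/weights/dst memory
+      // descriptors with format 'any', so that the recreated forward PD and
+      // the backward data/weights PDs can each pick their preferred layout.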
+ + std::vector strides_temp = ctx.Attr>("strides"); + std::vector strides(begin(strides_temp), end(strides_temp)); + + std::vector paddings_temp = ctx.Attr>("paddings"); + std::vector paddings(begin(paddings_temp), end(paddings_temp)); + + std::vector dilations_temp = ctx.Attr>("dilations"); + std::vector dilations(begin(dilations_temp), + end(dilations_temp)); + + std::string padding_algorithm = + ctx.Attr("padding_algorithm"); + + int groups = ctx.Attr("groups"); + + auto input_dims = in->dims(); + auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); + auto filter_dims = filter->dims(); + auto filter_data_dims = + framework::slice_ddim(filter_dims, 2, filter_dims.size()); + + auto ksize = framework::vectorize(filter_data_dims); + + UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, + data_dims, strides, ksize); + + auto src_tz = framework::vectorize(in->dims()); + auto weights_tz = framework::vectorize(filter->dims()); + + int g = std::max(groups, 1); + platform::GetGroupConvWeightsTz(weights_tz, g); + auto dst_tz = paddle::framework::vectorize(out_grad->dims()); + + /* create memory descriptor for conv backward without specified format + * ('any') which lets a primitive (conv backward in this case) choose + * the memory format preferred for best performance + */ + const auto chosen_memory_format = MKLDNNMemoryFormat::any; + const auto weights_format = MKLDNNMemoryFormat::any; + + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + const auto dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + auto diff_src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, platform::MKLDNNGetDataType(), weights_format); + auto diff_weights_md = platform::MKLDNNMemDesc( + weights_tz, platform::MKLDNNGetDataType(), weights_format); + auto diff_dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + + auto mkldnn_paddings = platform::ToMkldnnPadding(paddings); + std::transform(dilations.begin(), dilations.end(), dilations.begin(), + [](int64_t i) { return i - 1; }); + const mkldnn::memory::dims dilations_dims = dilations; + + const mkldnn::memory::dims stride_dims = strides; + // Recreating FWD PD. 
For training there are no post ops in convolution + mkldnn::primitive_attr conv_attr; + if (bias) { + auto bias_tz = framework::vectorize(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::f32, MKLDNNMemoryFormat::x); + + this->AcquireForwardPrimitiveDescriptor( + conv_attr, mkldnn::prop_kind::forward_training, + dnnl::algorithm::convolution_direct, src_md, weights_md, bias_md, + dst_md, stride_dims, dilations_dims, mkldnn_paddings[0], + mkldnn_paddings[1]); + } else { + this->AcquireForwardPrimitiveDescriptor( + conv_attr, mkldnn::prop_kind::forward_training, + dnnl::algorithm::convolution_direct, src_md, weights_md, dst_md, + stride_dims, dilations_dims, mkldnn_paddings[0], + mkldnn_paddings[1]); + } + + this->AcquireBackwardPrimitiveDescriptor( + mkldnn::algorithm::convolution_direct, diff_src_md, weights_md, + diff_dst_md, strides, dilations_dims, mkldnn_paddings[0], + mkldnn_paddings[1]); + + this->AcquireBackwardWeightsPrimitiveDescriptor( + mkldnn::algorithm::convolution_direct, src_md, diff_weights_md, + diff_dst_md, strides, dilations_dims, mkldnn_paddings[0], + mkldnn_paddings[1]); + } + } + mkldnn::primitive_attr CreatePostOps( std::string fuse_activation, float fuse_alpha, float fuse_beta, bool fuse_residual_conn, const std::vector output_shift_scale = {}, @@ -280,27 +420,75 @@ class ConvMKLDNNHandlerT return conv_attr; } + std::shared_ptr + AcquireWeightsMemoryWithReorderFromDataPrimitive( + const framework::Tensor* filter, const int groups, const bool is_conv3d) { + const K* filter_data = filter->data(); + auto weights_tz = framework::vectorize(filter->dims()); + platform::GetGroupConvWeightsTz(weights_tz, groups); + + auto user_src_md = platform::MKLDNNMemDesc( + weights_tz, platform::MKLDNNGetDataType(), + GetWeightsFormat(filter->format(), groups, is_conv3d)); + + return this->AcquireMemoryWithReorder( + user_src_md, this->bwd_pd_->weights_desc(), + to_void_cast(filter_data), "@weights_mem_d_p", false); + } + std::shared_ptr AcquireSrcMemoryWithReorder( const framework::Tensor* input) { - const T* input_data = input->data(); - const std::string user_key_suffix{"@src_mem_p_user"}; - auto user_src_mem_p = this->AcquireMemory(user_key_suffix); + return this->AcquireMemoryWithReorderPrimitive( + input, "@src_mem_p_user", "@src_mem_p_target", "@src_mem_p", + this->fwd_pd_->src_desc()); + } - if (!user_src_mem_p) { - auto user_src_md = platform::MKLDNNMemDesc( - framework::vectorize(input->dims()), platform::MKLDNNGetDataType(), - input->format()); + std::shared_ptr + AcquireSrcMemoryWithReorderFromWeightsPrimitive( + const framework::Tensor* input) { + return this->AcquireMemoryWithReorderPrimitive( + input, "@src_mem_w_p_user", "@src_mem_w_p_target", "@src_mem_w_p", + this->bwd_w_pd_->src_desc()); + } + + std::shared_ptr + AcquireDiffDstMemoryWithReorderFromWeightsPrimitive( + const framework::Tensor* out_grad) { + return this->AcquireMemoryWithReorderPrimitive( + out_grad, "@diff_dst_mem_w_p_user", "@diff_dst_mem_w_p_target", + "@diff_dst_mem_w_p", this->bwd_w_pd_->diff_dst_desc()); + } + + std::shared_ptr + AcquireDiffDstMemoryWithReorderMemoryFromDataPrimitive( + const framework::Tensor* out_grad) { + return this->AcquireMemoryWithReorderPrimitive( + out_grad, "@diff_dst_mem_p_user", "@diff_dst_mem_p_target", + "@diff_dst_mem_p", this->bwd_pd_->diff_dst_desc()); + } + + std::shared_ptr AcquireMemoryWithReorderPrimitive( + const framework::Tensor* in_mem, const char* key_mem_user, + const char* key_mem_target, const char* key_mem, 
+ const mkldnn::memory::desc& mem_md) { + const T* in_mem_data = in_mem->data(); + const std::string user_key_suffix{key_mem_user}; + auto user_mem_p = this->AcquireMemory(user_key_suffix); + + if (!user_mem_p) { + auto user_mem_md = platform::MKLDNNMemDesc( + framework::vectorize(in_mem->dims()), + platform::MKLDNNGetDataType(), in_mem->format()); return this->AcquireMemoryWithReorder( - user_src_md, this->fwd_pd_->src_desc(), to_void_cast(input_data), - "@src_mem_p"); + user_mem_md, mem_md, to_void_cast(in_mem_data), key_mem); } else { - const std::string target_key_suffix{"@src_mem_p_target"}; - const auto target_src_mem_p = this->AcquireMemory(target_key_suffix); - user_src_mem_p->set_data_handle(to_void_cast(input_data)); - if (user_src_mem_p != target_src_mem_p) { - this->AcquireReorder(user_src_mem_p, target_src_mem_p, "@src_mem_p"); + const std::string target_key_suffix{key_mem_target}; + const auto target_mem_p = this->AcquireMemory(target_key_suffix); + user_mem_p->set_data_handle(to_void_cast(in_mem_data)); + if (user_mem_p != target_mem_p) { + this->AcquireReorder(user_mem_p, target_mem_p, key_mem); } - return target_src_mem_p; + return target_mem_p; } } @@ -866,7 +1054,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } }; -template +template class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { @@ -879,189 +1067,44 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* input = ctx.Input("Input"); const Tensor* filter = ctx.Input("Filter"); + const Tensor* bias = + ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; const Tensor* output_grad = ctx.Input(framework::GradVarName("Output")); Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); - PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, input->layout())); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Input tensor.")); - - PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); - PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Filter tensor.")); - - PADDLE_ENFORCE_EQ( - output_grad->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The output_grad tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, output_grad->layout())); - PADDLE_ENFORCE_NE(output_grad->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Wrong format set for output_grad tensor")); - - PADDLE_ENFORCE_EQ( - ctx.Attr("is_test"), false, - platform::errors::InvalidArgument( - "is_test attribute should be set to False in training phase.")); - if (!input_grad && !filter_grad) return; - std::vector strides_temp = ctx.Attr>("strides"); - std::vector strides(begin(strides_temp), end(strides_temp)); - - std::vector paddings_temp = ctx.Attr>("paddings"); - std::vector paddings(begin(paddings_temp), end(paddings_temp)); - - std::vector dilations_temp = ctx.Attr>("dilations"); - std::vector dilations(begin(dilations_temp), end(dilations_temp)); - - std::string 
padding_algorithm = ctx.Attr("padding_algorithm"); - - int groups = ctx.Attr("groups"); - - bool is_conv3d = strides.size() == 3U; - const T* input_data = input->data(); - const T* filter_data = filter->data(); - const T* output_grad_data = output_grad->data(); - T* input_grad_data = nullptr; - T* filter_grad_data = nullptr; - - auto input_dims = input->dims(); - auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); - auto filter_dims = filter->dims(); - auto filter_data_dims = - framework::slice_ddim(filter_dims, 2, filter_dims.size()); - - auto ksize = framework::vectorize(filter_data_dims); - - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - data_dims, strides, ksize); - - auto src_tz = paddle::framework::vectorize(input->dims()); - auto weights_tz = paddle::framework::vectorize(filter->dims()); - - int g = std::max(groups, 1); - platform::GetGroupConvWeightsTz(weights_tz, g); - auto dst_tz = paddle::framework::vectorize(output_grad->dims()); - - auto src_format = input->format(); - MKLDNNMemoryFormat weights_format = - GetWeightsFormat(filter->format(), g, is_conv3d); - - // Get an unique name from "argument" name of "input" and "Filter" variable - // as well as attributes of primitive to be created - // This name will be used as key when saving info into device context - std::string key = platform::CreateKey( - dev_ctx, src_tz, ctx.InputName("Input") + ctx.InputName("Filter")); - - const std::string key_conv_pd = key + "@fwd_pd"; - key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - std::vector pipeline; - - // Create user memory descriptors - auto user_src_md = platform::MKLDNNMemDesc( - {src_tz}, platform::MKLDNNGetDataType(), src_format); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), weights_format); - auto user_diff_dst_md = platform::MKLDNNMemDesc( - {dst_tz}, platform::MKLDNNGetDataType(), output_grad->format()); - - /* create memory descriptor for conv backward without specified format - * ('any') which lets a primitive (conv backward in this case) choose - * the memory format preferred for best performance - */ - auto chosen_memory_format = MKLDNNMemoryFormat::any; - weights_format = MKLDNNMemoryFormat::any; - - auto src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - auto diff_src_md = platform::MKLDNNMemDesc( - src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), weights_format); - auto diff_weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), weights_format); - auto diff_dst_md = platform::MKLDNNMemDesc( - dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - // Retrieve conv_pd from device context - auto conv_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); - PADDLE_ENFORCE_NE(conv_pd, nullptr, - platform::errors::InvalidArgument( - "Fail to find conv_pd in device context")); - - auto mkldnn_paddings = platform::ToMkldnnPadding(paddings); - std::transform(dilations.begin(), dilations.end(), dilations.begin(), - [](int64_t i) { return i - 1; }); - const mkldnn::memory::dims dilations_dims = dilations; - // create backward convolution weights primitive descriptor - auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc( - mkldnn::algorithm::convolution_direct, src_md, diff_weights_md, - diff_dst_md, strides, dilations_dims, mkldnn_paddings[0], - 
mkldnn_paddings[1]); - - auto conv_bwd_weights_pd = - std::make_shared( - conv_bwd_weights_desc, mkldnn_engine, *conv_pd); - - // create backward convolution data primitive descriptor - auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc( - mkldnn::algorithm::convolution_direct, diff_src_md, weights_md, - diff_dst_md, strides, dilations_dims, mkldnn_paddings[0], - mkldnn_paddings[1]); - - auto conv_bwd_data_pd = - std::make_shared( - conv_bwd_data_desc, mkldnn_engine, *conv_pd); - - platform::ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd, - conv_bwd_weights_pd, dev_ctx, - mkldnn_engine, key); + // TODO(jczaja): Are all tensors really needed? + ConvMKLDNNHandlerT handler( + ctx, dev_ctx, ctx.GetPlace(), input, filter, bias, output_grad, + filter_grad, input_grad, + ctx.InputName("Input") + ctx.InputName("Filter")); // create mkldnn memory from input tensors (data/weights) - auto user_src_memory_p = - handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); - auto user_weights_memory_p = handler.AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); - auto user_diff_dst_memory_p = handler.AcquireDiffDstMemory( - user_diff_dst_md, to_void_cast(output_grad_data)); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - if (filter_grad) { - auto src_memory_p = handler.AcquireSrcMemoryFromWeightsPrimitive( - user_src_memory_p, pipeline); - - auto diff_dst_memory_4filter_p = - handler.AcquireDiffDstMemoryFromWeightsPrimitive( - user_diff_dst_memory_p, pipeline); - const size_t size = handler.GetDiffWeightsMemorySize(); - filter_grad_data = filter_grad->mutable_data(ctx.GetPlace(), size); + if (filter_grad) { + auto src_memory_p = + handler.AcquireSrcMemoryWithReorderFromWeightsPrimitive(input); + auto diff_dst_memory_p = + handler.AcquireDiffDstMemoryWithReorderFromWeightsPrimitive( + output_grad); // For convoluition with groups write filter grad into // oneDNN buffer and then we reorder it into filter_grad tensor + int g = std::max(ctx.Attr("groups"), 1); auto diff_weights_memory_p = - g > 1 ? handler.AcquireDiffWeightsMemoryFromWeightsPrimitive() - : handler.AcquireDiffWeightsMemoryFromWeightsPrimitive( - reinterpret_cast(filter_grad_data)); + g > 1 ? handler.AcquireDiffWeightsMemory() + : handler.AcquireDiffWeightsMemory(filter_grad); - auto conv_bwd_weights_p = handler.AcquireConvolutionBackwardWeights(); + auto conv_bwd_weights_p = handler.AcquireBackwardWeightsPrimitive(); // TODO(grygielski) why no bias_diff? conv_bwd_weights_p->execute( astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_4filter_p}, + {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_p}, {MKLDNN_ARG_DIFF_WEIGHTS, *diff_weights_memory_p}}); astream.wait(); @@ -1073,10 +1116,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // For convolution with groups convert from blocked to NCHW // otherwise there will be problems in next operators working on this data if (g > 1) { - memory::data_type in_type = - framework::ToMKLDNNDataType(filter_grad->type()); + memory::data_type in_type = framework::ToMKLDNNDataType(filter->type()); // for 3d conv with groups (six dimensional data reorder to goidhw) // for 2d conv with groups (five dimensional data reorder to goihw) + // auto weights_tz = paddle::framework::vectorize(filter->dims()); + + auto weights_tz = diff_weights_memory_p->get_desc().dims(); mkldnn::memory::format_tag out_format = weights_tz.size() == 6 ? 
mkldnn::memory::format_tag::goidhw : mkldnn::memory::format_tag::goihw; @@ -1084,9 +1129,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { out_format, in_type); key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - platform::ReorderMKLDNNHandler handler(weights_tz, filter_grad->type(), - in_type, dev_ctx, mkldnn_engine, - key); + platform::ReorderMKLDNNHandler handler( + weights_tz, filter->type(), in_type, dev_ctx, mkldnn_engine, key); auto reorder_dst_memory_p = handler.AcquireDstMemory(filter_grad, out_format, ctx.GetPlace()); @@ -1113,24 +1157,21 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { } } if (input_grad) { - auto weights_memory_p = handler.AcquireWeightsMemoryFromDataPrimitive( - user_weights_memory_p, pipeline); - - auto diff_dst_memory_4data_p = - handler.AcquireDiffDstMemoryFromDataPrimitive(user_diff_dst_memory_p, - pipeline); - - const size_t size = handler.GetDiffSourceMemorySize(); - input_grad_data = input_grad->mutable_data(ctx.GetPlace(), size); + auto weights_memory_p = + handler.AcquireWeightsMemoryWithReorderFromDataPrimitive( + filter, ctx.Attr("groups"), + ctx.Attr>("strides").size() == 3U); - auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive( - reinterpret_cast(input_grad_data)); + auto diff_dst_memory_p = + handler.AcquireDiffDstMemoryWithReorderMemoryFromDataPrimitive( + output_grad); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(input_grad); - auto conv_bwd_data_p = handler.AcquireConvolutionBackwardData(); + auto conv_bwd_data_p = handler.AcquireBackwardPrimitive(); conv_bwd_data_p->execute(astream, {{MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_4data_p}, + {MKLDNN_ARG_DIFF_DST, *diff_dst_memory_p}, {MKLDNN_ARG_DIFF_SRC, *diff_src_memory_p}}); astream.wait(); @@ -1167,7 +1208,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNGradOpKernel); + ops::ConvMKLDNNGradOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN, ::paddle::platform::CPUPlace, FP32, @@ -1177,4 +1218,4 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNGradOpKernel); + ops::ConvMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index e2e9d280027b6a30958b308429cbb21d61fb2c08..5b563e666af0aaa7034594de18fbb69813a93195 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -14,21 +14,104 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; +template +class LRNMKLDNNHandler : public platform::MKLDNNHandlerT { + public: + LRNMKLDNNHandler(const framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, + platform::Place cpu_place, const Tensor* input, + const std::string& unique_name) + + : platform::MKLDNNHandlerT( + dev_ctx, mkldnn_engine, cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), + unique_name)) { + if (!this->isCached()) { + const int n = ctx.Attr("n"); + // MKL-DNN implements LRN in a caffe way: + // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html + // Where sum of squares is divided by size of normalization window + // this is not the case for PaddlePaddle LRN. + // Hence we need to compensate for this diffrence by + // multipliing alpha by size of window(n) + const float alpha = ctx.Attr("alpha") * static_cast(n); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + bool is_test = ctx.Attr("is_test"); + + auto dims = framework::vectorize(input->dims()); + + auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), + input->format()); + + this->AcquireForwardPrimitiveDescriptor( + is_test ? mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training, + mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); + } + } + + LRNMKLDNNHandler(const framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, const Tensor* in_x, + const Tensor* out_grad, Tensor* in_x_grad, + const std::string& unique_name) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()), + unique_name)) { + if (!this->isBwdCached()) { + PADDLE_ENFORCE_EQ( + ctx.Attr("is_test"), false, + platform::errors::PreconditionNotMet( + "is_test attribute should be set to False in training phase.")); + + const int n = ctx.Attr("n"); + const float alpha = ctx.Attr("alpha") * static_cast(n); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + + auto dims = framework::vectorize(in_x->dims()); + + auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), + in_x->format()); + auto diff_md = mkldnn::memory::desc( + dims, platform::MKLDNNGetDataType(), out_grad->format()); + + this->AcquireForwardPrimitiveDescriptor( + mkldnn::prop_kind::forward_training, + mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); + + this->AcquireBackwardPrimitiveDescriptor( + mkldnn::algorithm::lrn_across_channels, src_md, diff_md, n, alpha, + beta, k); + } + } + + std::shared_ptr AcquireWorkspaceMemory(Tensor* workspace) { + T* ptr = workspace->mutable_data( + this->place_, this->fwd_pd_->workspace_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), + ptr, "@wrk_mem_p"); + } + + std::shared_ptr AcquireBackwardWorkspaceMemory( + const Tensor* workspace) { + const T* workspace_data = workspace->data(); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->workspace_desc(), + platform::to_void_cast(workspace_data), "@bwd-wrk_mem_p"); + } +}; + template class 
LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -48,8 +131,8 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto out = ctx.Output("Out"); auto mid = ctx.Output("MidOut"); - platform::LRNMKLDNNHandler handler( - ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, ctx.OutputName("Out")); + LRNMKLDNNHandler handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, + ctx.OutputName("Out")); auto src_memory = handler.AcquireSrcMemory(x); auto dst_memory = handler.AcquireDstMemory(out); @@ -87,34 +170,22 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, paddle::platform::errors::PreconditionNotMet( "Operator DNNL LRNGrad must use CPUPlace")); - PADDLE_ENFORCE_EQ( - ctx.Attr("is_test"), false, - platform::errors::PreconditionNotMet( - "is_test attribute should be set to False in training phase.")); - auto x = ctx.Input("X"); + auto in_x = ctx.Input("X"); auto mid = ctx.Input("MidOut"); auto out_grad = ctx.Input(framework::GradVarName("Out")); - auto x_grad = ctx.Output(framework::GradVarName("X")); - - const int n = ctx.Attr("n"); - const float alpha = ctx.Attr("alpha") * static_cast(n); - const float beta = ctx.Attr("beta"); - const float k = ctx.Attr("k"); + auto in_x_grad = ctx.Output(framework::GradVarName("X")); auto& dev_ctx = ctx.template device_context(); - auto dims = paddle::framework::vectorize(x->dims()); + LRNMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), in_x, out_grad, + in_x_grad, ctx.InputName("Out")); - platform::LRNMKLDNNHandler handler(dims, n, alpha, beta, k, x->format(), - out_grad->format(), dev_ctx, - ctx.GetPlace(), ctx.InputName("Out")); - - auto src_memory = handler.AcquireSrcMemory(x); + auto src_memory = handler.AcquireSrcMemory(in_x); auto workspace = handler.AcquireBackwardWorkspaceMemory(mid); auto diff_dst_memory = handler.AcquireDiffDstMemory(out_grad); - auto diff_src_memory = handler.AcquireDiffSrcMemory(x_grad); + auto diff_src_memory = handler.AcquireDiffSrcMemory(in_x_grad); auto lrn_bwd = handler.AcquireBackwardPrimitive(); @@ -125,8 +196,8 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_WORKSPACE, *workspace}}); astream.wait(); - x_grad->set_layout(framework::DataLayout::kMKLDNN); - x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory)); + in_x_grad->set_layout(framework::DataLayout::kMKLDNN); + in_x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory)); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index 3ef9d88e4e91e17eb9fbaeac1bcbed53a1bac09e..2b3496359b0c66cccc40ed676cfc462d8148a11c 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace platform { @@ -37,6 +37,111 @@ using platform::MKLDNNGetDataType; using platform::to_void_cast; using Tensor = framework::Tensor; +// Reshape a rank-3 tensor from P x M x N to (P * M) x N. +// Identity op if the tensor is not of rank 3. 
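+// For example, a 2 x 3 x 4 tensor is returned as a 6 x 4 view: only the
+// shallow copy's shape is changed, the underlying buffer is shared.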
+static framework::Tensor FoldOuterDims(const Tensor& input) { + auto output = input; + auto in_dims = input.dims(); + if (in_dims.size() == 3) { + output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); + } + return output; +} + +// Reshape a rank-3 tensor from P x M x N to M x (P * N). +// (Warning: This requires transposing data and writes into new memory.) +// Identity op if the tensor is not of rank 3. +template +static framework::Tensor FoldFirstAndLastDims( + const MKLDNNDeviceContext& dev_ctx, const Tensor* input) { + auto input_dims = framework::vectorize(input->dims()); + if (input_dims.size() != 3) { + return *input; + } + + framework::Tensor output; + output.Resize({input_dims[1], input_dims[0], input_dims[2]}); + + auto output_dims = framework::vectorize(output.dims()); + + memory::data_type input_type = framework::ToMKLDNNDataType(input->type()); + std::string key = platform::CreateKey(dev_ctx, input_dims, input->format(), + input->format(), input_type); + platform::ReorderMKLDNNHandler reorder_handler(output_dims, input->type(), + input_type, dev_ctx, + dev_ctx.GetEngine(), key); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + memory::format_tag::abc, platform::to_void_cast(input->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + &output, memory::format_tag::bac, dev_ctx.GetPlace()); + auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, + reorder_dst_memory_p); + + platform::RecordEvent record_reorder("int_reorder", + platform::EventRole::kUniqueOp); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + output.Resize({input_dims[1], input_dims[0] * input_dims[2]}); + return output; +} + +template +class MatMulMKLDNNHandler : public platform::MKLDNNHandlerT { + public: + MatMulMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine engine, platform::Place cpu_place, + Tensor* x, bool trans_x, Tensor* y, bool trans_y, + Tensor* out, float scale, const std::string& uniq_name) + : platform::MKLDNNHandlerT( + dev_ctx, engine, cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), + uniq_name)) { + if (!this->isCached()) { + auto mat_dim_x = math::CreateMatrixDescriptor(x->dims(), 0, trans_x); + auto mat_dim_y = math::CreateMatrixDescriptor(y->dims(), 0, trans_y); + + memory::dim x_bs = mat_dim_x.batch_size_; + memory::dim y_bs = mat_dim_y.batch_size_; + + memory::dim out_bs = x_bs || y_bs ? std::max(x_bs, y_bs) : 1; + const memory::dim M = mat_dim_x.height_; + const memory::dim N = mat_dim_y.width_; + const memory::dim K = mat_dim_x.width_; + + memory::dims x_dims = {x_bs > 0 ? x_bs : 1, M, K}; + memory::dims y_dims = {y_bs > 0 ? y_bs : 1, K, N}; + memory::dims out_dims = {out_bs, M, N}; + + memory::dims x_strides = + !trans_x ? memory::dims{M * K, K, 1} : memory::dims{M * K, 1, M}; + + memory::dims y_strides = + !trans_y ? 
memory::dims{N * K, N, 1} : memory::dims{N * K, 1, K}; + memory::dims out_strides = memory::dims{M * N, N, 1}; + + auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); + auto out_md = memory::desc(out_dims, MKLDNNGetDataType(), out_strides); + + dnnl::primitive_attr attrs; + if (scale != 1.0f) attrs.set_output_scales(0, {scale}); + + this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md); + } + } + + std::shared_ptr AcquireWeightsMemory(const Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), + to_void_cast(input_data), + "@weights_mem_p"); + } +}; + template constexpr bool IsInt8() { return std::is_same::value || std::is_same::value; @@ -44,7 +149,7 @@ constexpr bool IsInt8() { template constexpr bool IsBfloat16() { - return std::is_same::value; + return std::is_same::value; } // Get row matrix shape from a vector shape. If the rank of x_dim > 1, the @@ -60,6 +165,60 @@ static framework::DDim ColumnMatrixDimsFromVector( return y_dim.size() > 1 ? y_dim : framework::make_ddim({y_dim[0], 1}); } +/** + * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. + * + * The shape would be [BatchSize, H, W] or [H, W]. + * If transposed, `H,W` will be swapped. + */ +static void ReshapeTensorToMatrixSequence( + framework::Tensor* x, const math::MatDescriptor& descriptor) { + int64_t h, w; + h = descriptor.height_; + w = descriptor.width_; + if (descriptor.trans_) { + std::swap(w, h); + } + if (descriptor.batch_size_) { + x->Resize({descriptor.batch_size_, h, w}); + } else { + x->Resize({h, w}); + } +} + +/** + * Reshape the x,y,out tensor to 3-D or 2-D tensor by matrix descriptor + * Out = matmul(x, y) + * + * This method will first calculate X,Y matrix sequence, and then calculate + * the out shape. + * + * Assume X = [BatchSize, H1, W1], Y = [BatchSize, H2, W2] + * The out = [BatchSize, H1, W2] + * + * If there is no batch size in `X` and `Y`, the out will be [H1, W2] + * If any of `X` and `Y` has batch size BatchSize, the out will have the + * BatchSize. 
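+ *
+ * Example (no transpose): X = [2, 3, 4] and Y = [4, 5] give
+ * mat_dim_x = {batch 2, 3 x 4} and mat_dim_y = {batch 0, 4 x 5},
+ * so Out is resized to [2, 3, 5].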
+ */ +static void ReshapeXYOutToMatrixSequence(framework::Tensor* x, + framework::Tensor* y, + framework::Tensor* out, bool trans_x, + bool trans_y) { + auto x_dim = RowMatrixDimsFromVector(x->dims()); + auto y_dim = ColumnMatrixDimsFromVector(y->dims()); + auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x); + auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y); + if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { + out->Resize({mat_dim_x.height_, mat_dim_y.width_}); + } else { + out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_), + mat_dim_x.height_, mat_dim_y.width_}); + } + + ReshapeTensorToMatrixSequence(x, mat_dim_x); + ReshapeTensorToMatrixSequence(y, mat_dim_y); +} + template class MatMulFactory { public: @@ -372,7 +531,7 @@ static void ExecuteMatMul(const ExecutionContext& ctx) { template class DNNLMatMulKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { + void Compute(const ExecutionContext& ctx) const override { if (ctx.HasAttr("head_number")) { PADDLE_ENFORCE_EQ( ctx.Attr("head_number"), 1, @@ -385,6 +544,137 @@ class DNNLMatMulKernel : public framework::OpKernel { ExecuteMatMul(ctx); } }; + +template +class MatMulGradMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const ExecutionContext& ctx) const override { + if (ctx.HasAttr("head_number")) { + PADDLE_ENFORCE_EQ( + ctx.Attr("head_number"), 1, + platform::errors::Unimplemented( + "DNNL matmul doesn't support multiple heads. Expected " + "head_number=1. But received `head_number` is %d", + ctx.Attr("head_number"))); + } + RunKernel(ctx); + } + + private: + void ExecuteMatMulGrad(const ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine& engine, Tensor* x, bool trans_x, + bool is_fold_init_dims_x, Tensor* y, bool trans_y, + bool is_fold_init_dims_y, Tensor* out, + int execution_number) const { + // gradient is calculated in a different way when broadcasting is used + bool need_combine = (x->dims().size() == 3 || y->dims().size() == 3) && + out->dims().size() == 2; + + Tensor x_combined, y_combined; + if (!need_combine) { + x_combined = *x; + y_combined = *y; + } else { + x_combined = is_fold_init_dims_x ? FoldOuterDims(*x) + : FoldFirstAndLastDims(dev_ctx, x); + y_combined = is_fold_init_dims_y ? 
FoldOuterDims(*y) + : FoldFirstAndLastDims(dev_ctx, y); + } + + MatMulMKLDNNHandler handler( + dev_ctx, engine, ctx.GetPlace(), &x_combined, trans_x, &y_combined, + trans_y, out, ctx.Attr("alpha"), + ctx.InputName(framework::GradVarName("Out")) + + std::to_string(execution_number)); + + const auto src_memory_p = handler.AcquireSrcMemory(&x_combined); + const auto weights_memory_p = handler.AcquireWeightsMemory(&y_combined); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory_p->get_desc().reshape( + framework::vectorize(out->dims())))); + } + + template + void RunKernel(const ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto x = *ctx.Input("X"); + auto y = *ctx.Input("Y"); + auto dout = *ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + + ReshapeXYOutToMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); + } + } + + framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); + } + } + + if (transpose_x && transpose_y) { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &y, true, true, + &dout, true, false, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, true, true, + &x, true, false, dy, 1); + } else if (transpose_x) { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &y, false, false, + &dout, true, false, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &x, false, false, + &dout, false, true, dy, 1); + } else if (transpose_y) { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, false, false, + &y, false, true, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, true, true, + &x, false, true, dy, 1); + } else { + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &dout, false, false, + &y, true, false, dx, 0); + this->ExecuteMatMulGrad(ctx, dev_ctx, onednn_engine, &x, true, true, + &dout, false, true, dy, 1); + } + + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); + } + } + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); + } + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; @@ -394,3 +684,7 @@ REGISTER_OP_KERNEL(matmul, MKLDNN, ::paddle::platform::CPUPlace, ops::DNNLMatMulKernel, ops::DNNLMatMulKernel, ops::DNNLMatMulKernel); + +REGISTER_OP_KERNEL(matmul_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::MatMulGradMKLDNNKernel, + ops::MatMulGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..50afd417170e0f5fb633345d40552344a876786d --- /dev/null +++ 
b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -0,0 +1,205 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using dnnl::memory; +using dnnl::primitive; +using framework::DataLayout; +using framework::ExecutionContext; +using platform::GetMKLDNNFormat; +using platform::MKLDNNDeviceContext; +using platform::MKLDNNGetDataType; +using platform::to_void_cast; +using Tensor = framework::Tensor; + +template +class MatMulV2MKLDNNHandler : public platform::MKLDNNHandlerT { + public: + MatMulV2MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine engine, platform::Place cpu_place, + std::vector& x_dims, bool trans_x, + std::vector& y_dims, bool trans_y, + const std::string& uniq_name) + : platform::MKLDNNHandlerT( + dev_ctx, engine, cpu_place, + platform::CreateKey(dev_ctx, x_dims, uniq_name)) { + if (!this->isCached()) { + // M X K * K X N + const int MB_idx = x_dims.size() - 3; + const int H_idx = x_dims.size() - 2; + const int W_idx = x_dims.size() - 1; + + if (trans_x) std::swap(x_dims[H_idx], x_dims[W_idx]); + if (trans_y) std::swap(y_dims[H_idx], y_dims[W_idx]); + + const memory::dim M = x_dims[H_idx]; + const memory::dim K = x_dims[W_idx]; + const memory::dim N = y_dims[W_idx]; + + std::vector x_strides(x_dims.size() - 3, 1); + std::vector y_strides(x_dims.size() - 3, 1); + std::vector out_strides(x_dims.size() - 3, 1); + std::vector out_ddims(x_dims.size() - 3, 1); + + x_strides.reserve(x_dims.size()); + y_strides.reserve(x_dims.size()); + out_strides.reserve(x_dims.size()); + + if (!trans_x) { + x_strides.insert(x_strides.end(), {M * K, K, 1}); + } else { + x_strides.insert(x_strides.end(), {M * K, 1, M}); + } + + if (!trans_y) { + y_strides.insert(y_strides.end(), {N * K, N, 1}); + } else { + y_strides.insert(y_strides.end(), {N * K, 1, K}); + } + + out_strides.insert(out_strides.end(), {M * N, N, 1}); + out_ddims.insert(out_ddims.end(), + {std::max(x_dims[MB_idx], y_dims[MB_idx]), M, N}); + + for (int i = x_dims.size() - 4; i >= 0; --i) { + out_ddims[i] = std::max(x_dims[i], y_dims[i]); + x_strides[i] = x_dims[i + 1] * x_strides[i + 1]; + y_strides[i] = y_dims[i + 1] * y_strides[i + 1]; + out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; + } + + auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); + auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); + auto out_md = + memory::desc(out_ddims, MKLDNNGetDataType(), out_strides); + + this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md); + } + } + + std::shared_ptr AcquireWeightsMemory(const Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), + to_void_cast(input_data), + "@weights_mem_p"); + } +}; + +template +class 
MatMulV2MKLDNNKernel : public framework::OpKernel { + public: + void Compute(const ExecutionContext& ctx) const override { RunKernel(ctx); } + + private: + void CalculateMatrixDims(const ExecutionContext& ctx, + const std::vector& x_dims, + const std::vector& y_dims, + std::vector& x_bd_dims, + std::vector& y_bd_dims, + std::vector& out_dims, Tensor* out) const { + if (x_dims.size() == 1) { + x_bd_dims[x_bd_dims.size() - 1] = x_dims[0]; + } else { + for (size_t i = 0; i < x_dims.size(); ++i) { + x_bd_dims[i] = x_dims[i]; + } + } + if (y_dims.size() == 1) { + y_bd_dims[x_bd_dims.size() - 2] = y_dims[0]; + } else { + for (size_t i = 0; i < y_dims.size(); ++i) { + y_bd_dims[i] = y_dims[i]; + } + } + + if ((y_dims.size() == x_dims.size()) && y_dims.size() > 2) { + for (size_t i = 0; i < x_dims.size() - 2; ++i) { + PADDLE_ENFORCE_EQ( + x_dims[i] == y_dims[i] || x_dims[i] == 1 || y_dims[i] == 1, true, + platform::errors::InvalidArgument( + "Tensor dimensions are incorrect for broadcasting." + "Dimensions in X and Y must be same or equal to 1, but " + "received x_dim[%d]=%d and y_dims[%d]= %d", + i, x_dims[i], i, y_dims[i])); + out_dims[i] = std::max(x_dims[i], y_dims[i]); + } + out->Resize(framework::make_ddim(out_dims)); + } + } + + void RunKernel(const ExecutionContext& ctx) const { + const auto& dev_ctx = ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + + auto x_dims = framework::vectorize(x->dims()); + auto y_dims = framework::vectorize(y->dims()); + auto out_dims = framework::vectorize(out->dims()); + + int ndims = std::max(x->dims().size(), y->dims().size()); + ndims = std::max(ndims, 3); + + std::vector x_bd_dims(ndims, 1); + std::vector y_bd_dims(ndims, 1); + + CalculateMatrixDims(ctx, x_dims, y_dims, x_bd_dims, y_bd_dims, out_dims, + out); + + MatMulV2MKLDNNHandler handler(dev_ctx, onednn_engine, ctx.GetPlace(), + x_bd_dims, trans_x, y_bd_dims, trans_y, + ctx.InputName("X")); + + const auto src_memory_p = handler.AcquireSrcMemory(x); + const auto weights_memory_p = handler.AcquireWeightsMemory(y); + const auto dst_memory_p = handler.AcquireDstMemory(out); + + auto matmul_p = handler.AcquireForwardPrimitive(); + + std::unordered_map matmul_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto& astream = MKLDNNDeviceContext::tls().get_stream(); + matmul_p->execute(astream, matmul_args); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format( + GetMKLDNNFormat(dst_memory_p->get_desc().reshape(out_dims))); + } +}; +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(matmul_v2, MKLDNN, ::paddle::platform::CPUPlace, + ops::MatMulV2MKLDNNKernel, + ops::MatMulV2MKLDNNKernel); + +// REGISTER_OP_KERNEL(matmul_grad_v2, MKLDNN, ::paddle::platform::CPUPlace, +// ops::MatMulV2GradMKLDNNKernel, +// ops::MatMulV2GradMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index b7bed95b1d33583682b997def63bb38243d1794d..920ec97a769b6d12bdcc28606813003b353f0aef 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -100,11 +100,10 @@ class PoolingMKLDNNHandler const auto is_test = ctx.Attr("is_test"); const auto dt = 
framework::ToMKLDNNDataType(input->type()); - const auto fmt = input->format(); const auto exclude_padding = ctx.Attr("exclusive"); - const auto src_md = mkldnn::memory::desc(src_tz, dt, fmt); + const auto src_md = mkldnn::memory::desc(src_tz, dt, input->format()); /* create memory descriptor for pooling without specified format * ('any') which lets a primitive (pooling in this case) choose * the memory format preferred for best performance @@ -200,6 +199,10 @@ class PoolingMKLDNNHandler auto diff_dst_tz = paddle::framework::vectorize(out_grad->dims()); + const auto dt = framework::ToMKLDNNDataType(in_x->type()); + auto src_md = mkldnn::memory::desc(src_tz, dt, in_x->format()); + auto dst_md = + mkldnn::memory::desc(diff_dst_tz, dt, MKLDNNMemoryFormat::any); auto diff_dst_md = mkldnn::memory::desc( diff_dst_tz, platform::MKLDNNGetDataType(), out_grad->format()); auto diff_src_md = @@ -216,6 +219,17 @@ class PoolingMKLDNNHandler ComputeAdaptivePoolParameters(ctx, diff_src_tz, &ksize, &strides); const auto exclude_padding = ctx.Attr("exclusive"); + + this->AcquireForwardPrimitiveDescriptor( + mkldnn::prop_kind::forward_training, + pooling_type == "max" + ? mkldnn::algorithm::pooling_max + : (exclude_padding + ? mkldnn::algorithm::pooling_avg_exclude_padding + : mkldnn::algorithm::pooling_avg_include_padding), + src_md, dst_md, strides, ksize, mkldnn_paddings[0], + mkldnn_paddings[1]); + this->AcquireBackwardPrimitiveDescriptor( pooling_type == "max" ? mkldnn::algorithm::pooling_max diff --git a/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e2a4482666a1ace818777e9e7e3abaa1e6ff2f22 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc @@ -0,0 +1,187 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using dnnl::memory; +using framework::Tensor; +using platform::GetMKLDNNFormat; +using platform::MKLDNNDeviceContext; +using platform::MKLDNNGetDataType; +using platform::to_void_cast; + +namespace { +template +class PReluMKLDNNHandler + : public platform::MKLDNNHandlerT { + public: + PReluMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine engine, platform::Place cpu_place, + const Tensor* x, const Tensor* weights, + const std::string& uniq_name, const std::string& mode, + bool is_test = false) + : platform::MKLDNNHandlerT( + dev_ctx, engine, cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), + uniq_name)) { + if (!this->isCached()) { + auto x_md = memory::desc(framework::vectorize(x->dims()), + MKLDNNGetDataType(), x->format()); + + auto weights_dims = framework::vectorize(weights->dims()); + + // weights must have same size as X only for "element" case + if (weights->dims().size() != x->dims().size()) { + auto new_weights_dims = std::vector(x->dims().size(), 1); + if (mode == "channel") { + new_weights_dims[1] = + *std::max_element(weights_dims.begin(), weights_dims.end()); + } + weights_dims = std::move(new_weights_dims); + } + auto weights_md = memory::desc(weights_dims, MKLDNNGetDataType(), + memory::format_tag::any); + + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, + x_md, weights_md); + if (!is_test) + this->AcquireBackwardPrimitiveDescriptor(x_md, weights_md, x_md, + weights_md); + } + } + + std::shared_ptr AcquireWeightsMemoryPossiblyWithReorder( + const Tensor* input, const bool is_test) { + const T* input_data = input->data(); + + // if weights are 1D, every format tag is correct, so we accept + // format_tag::any's output and no reorder is needed + if (input->dims().size() == 1) { + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), + to_void_cast(input_data), + "@alpha_mem_p"); + } + + auto user_weights_md = + memory::desc(framework::vectorize(input->dims()), + MKLDNNGetDataType(), input->format()); + return this->AcquireMemoryWithReorder( + user_weights_md, this->fwd_pd_->weights_desc(), + to_void_cast(input_data), "@alpha_mem_p", is_test); + } + + std::shared_ptr AcquireDiffWeightsMemory(Tensor* output) { + T* output_data = output->mutable_data( + this->place_, this->bwd_pd_->diff_weights_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->bwd_pd_->diff_weights_desc(), + output_data, "@diff_weights_mem_p"); + } +}; +} // anonymous namespace + +template +class PReluMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + const auto* x = ctx.Input("X"); + const auto* alpha = ctx.Input("Alpha"); + auto* out = ctx.Output("Out"); + const bool is_test = ctx.Attr("is_test"); + const auto mode = ctx.Attr("mode"); + + PReluMKLDNNHandler handler(dev_ctx, onednn_engine, ctx.GetPlace(), x, + alpha, ctx.InputName("X"), mode, is_test); + + auto src_memory_p = handler.AcquireSrcMemory(x); + auto weights_memory_p = + handler.AcquireWeightsMemoryPossiblyWithReorder(alpha, is_test); + auto dst_memory_p = handler.AcquireDstMemory(out); + auto prelu_p = handler.AcquireForwardPrimitive(); + + auto& astream = 
MKLDNNDeviceContext::tls().get_stream(); + prelu_p->execute(astream, {{DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(GetMKLDNNFormat(*dst_memory_p)); + } +}; + +template +class PReluGradMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto* x = ctx.Input("X"); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dalpha = ctx.Output(framework::GradVarName("Alpha")); + auto* alpha = ctx.Input("Alpha"); + const bool is_test = ctx.Attr("is_test"); + const auto mode = ctx.Attr("mode"); + + PReluMKLDNNHandler handler(dev_ctx, onednn_engine, ctx.GetPlace(), x, + alpha, framework::GradVarName("X"), mode); + + auto src_memory_p = handler.AcquireSrcMemory(x); + auto weights_memory_p = + handler.AcquireWeightsMemoryPossiblyWithReorder(alpha, is_test); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); + auto diff_weights_memory_p = handler.AcquireDiffWeightsMemory(dalpha); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout); + auto prelu_p = handler.AcquireBackwardPrimitive(); + + auto& astream = MKLDNNDeviceContext::tls().get_stream(); + prelu_p->execute(astream, + {{DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, + {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}, + {DNNL_ARG_DIFF_WEIGHTS, *diff_weights_memory_p}}); + astream.wait(); + + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(GetMKLDNNFormat(*diff_src_memory_p)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(prelu, MKLDNN, paddle::platform::CPUPlace, + ops::PReluMKLDNNKernel, + ops::PReluMKLDNNKernel); + +REGISTER_OP_KERNEL(prelu_grad, MKLDNN, paddle::platform::CPUPlace, + ops::PReluGradMKLDNNKernel, + ops::PReluGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ae17048b5d568baf4722e63299c9ef2ca3fb6bae --- /dev/null +++ b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class ScaleMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + bool is_inplaced = x->IsSharedBufferWith(*out); + + platform::ActivationMKLDNNHandler handler( + mkldnn::algorithm::eltwise_linear, ctx, dev_ctx, ctx.GetPlace(), x, + ctx.InputName("X"), is_inplaced); + + auto src_memory_p = handler.AcquireSrcMemory(x); + auto dst_memory_p = handler.AcquireDstMemory(out); + auto activation_p = handler.AcquireForwardPrimitive(); + + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); + activation_p->execute(astream, {{MKLDNN_ARG_FROM, *src_memory_p}, + {MKLDNN_ARG_TO, *dst_memory_p}}); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(scale, MKLDNN, paddle::platform::CPUPlace, + ops::ScaleMKLDNNKernel, + ops::ScaleMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 1138d5113929329462a7ea6ccd01f1b7bc375322..e065800e4d1c71ee4bc47fe09b26ed1ea0b9d2c9 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -15,15 +15,6 @@ limitations under the License. */ #include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace framework { -class Tensor; -} // namespace framework -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { @@ -74,22 +65,36 @@ class SoftmaxMKLDNNHandler } } - SoftmaxMKLDNNHandler(const std::vector& dims, - const MKLDNNMemoryFormat fmt, - const MKLDNNMemoryFormat diff_fmt, const int& axis, - const platform::MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, const std::string& uniq_name) + SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, const Tensor* out, + const Tensor* out_grad, Tensor* in_x_grad, + const std::string& unique_name) : platform::MKLDNNHandlerT( dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, dims, uniq_name)) { - auto data_softmax_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); - auto diff_softmax_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), diff_fmt); - - this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, - axis); + platform::CreateKey(dev_ctx, framework::vectorize(out->dims()), + unique_name)) { + if (!this->isBwdCached()) { + PADDLE_ENFORCE_EQ( + out_grad->dims(), in_x_grad->dims(), + platform::errors::InvalidArgument("The shape of softmax_grad's input " + "and output must be identical.")); + + auto dims = out_grad->dims(); // input and output share the same shape + const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); + auto softmax_tz = framework::vectorize(dims); + + auto data_softmax_md = MKLDNNMemDesc( + softmax_tz, 
platform::MKLDNNGetDataType(), out->format()); + auto diff_softmax_md = MKLDNNMemDesc( + softmax_tz, platform::MKLDNNGetDataType(), out_grad->format()); + + this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, + data_softmax_md, axis); + this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, + axis); + } } }; @@ -145,27 +150,15 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { "Operator DNNL SoftmaxGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); const Tensor* output = ctx.Input("Out"); - auto* dout = ctx.template Input(framework::GradVarName("Out")); - auto* dx = - ctx.template Output(framework::GradVarName("X")); - - PADDLE_ENFORCE_EQ( - dout->dims(), dx->dims(), - platform::errors::InvalidArgument( - "The shape of softmax_grad's input and output must be identical.")); - - auto dims = dout->dims(); // input and output share the same shape - const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); - - auto softmax_tz = paddle::framework::vectorize(dims); + auto* out_grad = ctx.template Input(framework::GradVarName("Out")); + auto* in_x_grad = ctx.template Output(framework::GradVarName("X")); - SoftmaxMKLDNNHandler handler(softmax_tz, output->format(), - dout->format(), axis, dev_ctx, - ctx.GetPlace(), ctx.InputName("Out")); + SoftmaxMKLDNNHandler handler(ctx, dev_ctx, ctx.GetPlace(), output, + out_grad, in_x_grad, ctx.InputName("Out")); auto dst_memory_p = handler.AcquireDstMemory(output); - auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout); - auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(out_grad); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(in_x_grad); auto softmax_bwd_p = handler.AcquireBackwardPrimitive(); @@ -176,8 +169,8 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_DIFF_SRC, *diff_src_memory_p}}); astream.wait(); - dx->set_layout(framework::DataLayout::kMKLDNN); - dx->set_format(dout->format()); + in_x_grad->set_layout(framework::DataLayout::kMKLDNN); + in_x_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory_p)); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..afbe330305b7e10123a07e9b1418fe33064f76e8 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +static inline std::vector> CalculateOutsDims( + const framework::DDim& in_dims, const size_t num, + const std::vector& sections, const size_t axis, + const int outs_number) { + std::vector> outs_dims(outs_number, + framework::vectorize(in_dims)); + + if (num > 0) { + PADDLE_ENFORCE_EQ(in_dims[axis] % num, 0, + platform::errors::InvalidArgument( + "The input's size along the split dimension " + "must be evenly divisible by Attr(num_or_sections). " + "But received Attr(num_or_sections) " + "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", + num, in_dims, axis)); + + const size_t out_axis_dim = in_dims[axis] / num; + + for (auto& out_dim : outs_dims) out_dim[axis] = out_axis_dim; + } else { + for (size_t i = 0; i < outs_dims.size(); ++i) + outs_dims[i][axis] = sections[i]; + } + return outs_dims; +} + +template +class SplitMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + const auto* x = ctx.Input("X"); + auto outs = ctx.MultiOutput("Out"); + + int num = ctx.Attr("num"); + auto sections = ctx.Attr>("sections"); + int axis = ctx.Attr("axis"); + auto outs_number = outs.size(); + const auto x_dims = x->dims(); + + bool need_resize = false; + if (ctx.HasInput("AxisTensor")) { + auto* axis_tensor = ctx.Input("AxisTensor"); + axis = GetDataFromTensor(axis_tensor)[0]; + need_resize = true; + } + + auto sections_tensor_list = ctx.MultiInput("SectionsTensorList"); + if (sections_tensor_list.size() > 0) { + sections = GetDataFromTensorList(sections_tensor_list); + need_resize = true; + } + + if (need_resize) { + const auto outs_dims = + CalculateOutsDims(x->dims(), num, sections, axis, outs_number); + for (size_t i = 0; i < outs.size(); ++i) { + outs[i]->Resize(framework::make_ddim(outs_dims[i])); + } + } + + auto x_vec_dims = framework::vectorize(x_dims); + + mkldnn::memory::data_type x_type = framework::ToMKLDNNDataType(x->type()); + auto key = platform::CreateKey(dev_ctx, x_vec_dims, axis, num, sections, + x->format(), x_type); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + std::vector offset(x_vec_dims.size(), 0); + + platform::ReorderMKLDNNHandler reorder_handler( + x_vec_dims, x->type(), x_type, dev_ctx, onednn_engine, key); + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->format(), platform::to_void_cast(x->data())); + + for (size_t i = 0; i < outs_number; ++i) { + auto out_vec_dims = framework::vectorize(outs[i]->dims()); + auto slice_mem_p = reorder_handler.AcquireSrcSubmemory( + out_vec_dims, offset, reorder_src_memory_p, i); + + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + outs[i], out_vec_dims, i, x->format(), ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p, i); + + reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); + + offset[axis] += num > 0 ? 
x->dims()[axis] / num : sections[i]; + + outs[i]->set_layout(framework::DataLayout::kMKLDNN); + outs[i]->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } + astream.wait(); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(split, MKLDNN, paddle::platform::CPUPlace, + ops::SplitMKLDNNKernel, + ops::SplitMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 7618b1d9c31218bf6e15b048801a3bb196a94fce..1813aabf1d8548453932d5850dd48facc980b0ab 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -118,17 +118,6 @@ class SumMKLDNNHandler : public platform::MKLDNNHandlerT { inline int GetNumInputs(void) { return num_inputs_; } - protected: - // isCached need to be overloaded as base one works on key_common - bool isCached() { - const std::string key_pd = this->key_ + "@fwd_pd"; - this->fwd_pd_ = std::static_pointer_cast( - this->dev_ctx_.GetBlob(key_pd)); - - const std::string key_p = this->key_ + "@fwd_p"; - return (this->dev_ctx_.GetBlob(key_p) != nullptr); - } - private: int num_inputs_; std::vector srcs_suffix_; diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index aafff5248a0244e9090b10f6dc466c93eaa06888..cad4f47ec14022243ec04b50901a13f8d305a54e 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -50,7 +50,7 @@ class CacheTester { platform::CPUPlace place; onednn_dev_ctx_ = dynamic_cast(pool.Get(place)); - onednn_dev_ctx_->ResetBlobMap(); + onednn_dev_ctx_->ResetBlobMap(nullptr); } bool Analyze(unsigned short int num_entries) { @@ -180,17 +180,5 @@ TEST(test_elementwise_add_reuse_cache, cpu_place) { "Wrong number of cached oneDNN objects")); } -TEST(test_elementwises_sequence_reuse_cache, cpu_place) { - framework::DDim dims({32, 64}); - platform::CPUPlace p; - CacheTester ct; - RunOperator(p, "elementwise_add", dims, "elementwise_add_out", true); - RunOperator(p, "elementwise_mul", dims, "elementwise_add_out", true); - RunOperator(p, "relu", dims, "elementwise_add_out", true); - PADDLE_ENFORCE_EQ(ct.Analyze(11), true, - platform::errors::InvalidArgument( - "Wrong number of cached oneDNN objects")); -} - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index 643de3fd5be70ea9aac29b93c501c1a6de8a7737..0612417c46ce30a73ce0cbc582be740023ff0ab6 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -128,12 +128,6 @@ TEST(test_softmax_inplace, cpu_place) { ASSERT_TRUE(TestMain(p, "softmax", dims, 1)); } -TEST(test_elementwise_add_inplace, cpu_place) { - framework::DDim dims({1, 12, 20, 20}); - platform::CPUPlace p; - ASSERT_TRUE(TestMain(p, "elementwise_add", dims, 2)); -} - TEST(test_relu_inplace, cpu_place) { framework::DDim dims({1, 12, 20, 20}); platform::CPUPlace p; diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index e0736239d40f289a11a1e1fd8380fcbad904a667..9dcf012d512a954b8c75f38cbb42680cee7cca57 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -36,7 +36,7 @@ class MulNPUKernel : public framework::OpKernel { if (x_num_col_dims == 1 && 
y_num_col_dims == 1) { if (x->dims().size() == 2 && y->dims().size() == 2) { out->mutable_data(ctx.GetPlace()); - auto runner = + const auto& runner = NpuOpRunner("MatMul", {*x, *y}, {*out}, {{"transpose_x1", false}, {"transpose_x2", false}}); @@ -46,15 +46,11 @@ class MulNPUKernel : public framework::OpKernel { Tensor tmp_x(x->type()); int64_t sec_dim = x->dims()[1] * x->dims()[2]; int64_t first_dim = x->dims()[0]; - tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); - tmp_x.mutable_data(ctx.GetPlace()); - framework::TensorCopy( - *x, ctx.GetPlace(), - ctx.template device_context(), &tmp_x); + tmp_x.ShareDataWith(*x); tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); out->mutable_data(ctx.GetPlace()); // matmul - auto runner = + const auto& runner = NpuOpRunner("MatMul", {tmp_x, *y}, {*out}, {{"transpose_x1", false}, {"transpose_x2", false}}); runner.Run(stream); @@ -69,36 +65,39 @@ class MulNPUKernel : public framework::OpKernel { platform::errors::InvalidArgument( "now only support x_num_col_dims == 2: but got %d", x_num_col_dims)); - // flatten => x.shape=[6, 4] - Tensor tmp_x(x->type()); - int64_t first_dim = x->dims()[0] * x->dims()[1]; - int64_t sec_dim = x->dims()[2]; - tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); - tmp_x.mutable_data(ctx.GetPlace()); - framework::TensorCopy( - *x, ctx.GetPlace(), - ctx.template device_context(), &tmp_x); - tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); - - // matmul [6,4] , [4, 5] => [6, 5] - Tensor tmp_matmul(x->type()); - tmp_matmul.Resize(framework::make_ddim({first_dim, y->dims()[1]})); - tmp_matmul.mutable_data(ctx.GetPlace()); - - auto runner_matmul = - NpuOpRunner("MatMul", {tmp_x, *y}, {tmp_matmul}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - - runner_matmul.Run(stream); - // reshape [6, 5] => [2, 3, 5] - (*out).Resize( - framework::make_ddim({x->dims()[0], x->dims()[1], y->dims()[1]})); - out->mutable_data(ctx.GetPlace(), x->type()); - framework::TensorCopy( - tmp_matmul, ctx.GetPlace(), - ctx.template device_context(), out); - (*out).Resize( - framework::make_ddim({x->dims()[0], x->dims()[1], y->dims()[1]})); + if (x->type() == framework::proto::VarType::FP16 && + y->type() == framework::proto::VarType::FP16) { + // NOTE: When the dim of the input and output shapes is inconsistent, + // (Boradcast) BatchMatMul NPU OP only support FP16. 
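// Illustrative sketch (plain C++, not Paddle/ACL code; helper names are
// hypothetical): why the non-FP16 fallback below can flatten x from
// [d0, d1, k] to [d0 * d1, k] with ShareDataWith + Resize (a metadata-only
// reshape of a contiguous row-major buffer), run one MatMul against y [k, n],
// and view the [d0 * d1, n] result as [d0, d1, n].
#include <cassert>
#include <vector>

// C[m x n] = A[m x k] * B[k x n], all row-major and contiguous.
static void NaiveMatMul(const std::vector<float>& a, const std::vector<float>& b,
                        std::vector<float>* c, int m, int k, int n) {
  c->assign(static_cast<size_t>(m) * n, 0.f);
  for (int i = 0; i < m; ++i)
    for (int p = 0; p < k; ++p)
      for (int j = 0; j < n; ++j)
        (*c)[i * n + j] += a[i * k + p] * b[p * n + j];
}

// The flattened product and a batched product (y broadcast over the d0
// batches) write identical values into the same row-major layout.
static void CheckFlattenEqualsBatched(int d0, int d1, int k, int n) {
  std::vector<float> x(static_cast<size_t>(d0) * d1 * k);
  std::vector<float> y(static_cast<size_t>(k) * n);
  for (size_t i = 0; i < x.size(); ++i) x[i] = 0.01f * static_cast<float>(i);
  for (size_t i = 0; i < y.size(); ++i) y[i] = 0.02f * static_cast<float>(i);

  std::vector<float> flat_out;
  NaiveMatMul(x, y, &flat_out, d0 * d1, k, n);  // [d0*d1, k] x [k, n]

  for (int b = 0; b < d0; ++b) {
    std::vector<float> batch_x(x.begin() + b * d1 * k,
                               x.begin() + (b + 1) * d1 * k);
    std::vector<float> batch_out;
    NaiveMatMul(batch_x, y, &batch_out, d1, k, n);  // per-batch [d1, k] x [k, n]
    for (int i = 0; i < d1 * n; ++i)
      assert(batch_out[i] == flat_out[b * d1 * n + i]);
  }
}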
+ out->mutable_data(ctx.GetPlace()); + const auto& runner = + NpuOpRunner("BatchMatMul", {*x, *y}, {*out}, + {{"adj_x1", false}, {"adj_x2", false}}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } else { + // flatten => x.shape=[6, 4] + Tensor tmp_x(x->type()); + int64_t first_dim = x->dims()[0] * x->dims()[1]; + int64_t sec_dim = x->dims()[2]; + tmp_x.ShareDataWith(*x); + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + + // matmul [6,4] , [4, 5] => [6, 5] + out->mutable_data(ctx.GetPlace()); + + Tensor tmp_out(x->type()); + tmp_out.ShareDataWith(*out); + tmp_out.Resize(framework::make_ddim({first_dim, y->dims()[1]})); + + const auto& runner_matmul = + NpuOpRunner("MatMul", {tmp_x, *y}, {tmp_out}, + {{"transpose_x1", false}, {"transpose_x2", false}}); + runner_matmul.Run(stream); + } } } }; @@ -121,7 +120,7 @@ class MulGradNPUKernel : public framework::OpKernel { if (x->dims().size() == 2 && y->dims().size() == 2) { if (dx) { dx->mutable_data(ctx.GetPlace()); - auto runner_dx = + const auto& runner_dx = NpuOpRunner("MatMul", {*dout, *y}, {*dx}, {{"transpose_x1", false}, {"transpose_x2", true}}); @@ -130,7 +129,7 @@ class MulGradNPUKernel : public framework::OpKernel { if (dy) { dy->mutable_data(ctx.GetPlace()); - auto runner_dy = + const auto& runner_dy = NpuOpRunner("MatMul", {*x, *dout}, {*dy}, {{"transpose_x1", true}, {"transpose_x2", false}}); @@ -142,14 +141,14 @@ class MulGradNPUKernel : public framework::OpKernel { if (dx) { // matmul [2, 5] * [12, 5] => [2, 12] dx->mutable_data(ctx.GetPlace()); - auto dx_dims = dx->dims(); - dx->Resize(framework::make_ddim({dout->dims()[0], y->dims()[0]})); - auto runner_matmul = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, + Tensor tmp_dx(x->type()); + tmp_dx.ShareDataWith(*dx); + tmp_dx.Resize(framework::make_ddim({dout->dims()[0], y->dims()[0]})); + + const auto& runner_matmul = + NpuOpRunner("MatMul", {*dout, *y}, {tmp_dx}, {{"transpose_x1", false}, {"transpose_x2", true}}); runner_matmul.Run(stream); - // reshape [2, 12] => [2, 3, 4] - dx->Resize(dx_dims); } if (dy) { @@ -157,14 +156,10 @@ class MulGradNPUKernel : public framework::OpKernel { Tensor tmp_x(x->type()); int64_t sec_dim = x->dims()[1] * x->dims()[2]; int64_t first_dim = x->dims()[0]; - tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); - tmp_x.mutable_data(ctx.GetPlace()); - framework::TensorCopy( - *x, ctx.GetPlace(), - ctx.template device_context(), &tmp_x); + tmp_x.ShareDataWith(*x); tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); dy->mutable_data(ctx.GetPlace()); - auto runner_dy = + const auto& runner_dy = NpuOpRunner("MatMul", {tmp_x, *dout}, {*dy}, {{"transpose_x1", true}, {"transpose_x2", false}}); @@ -181,39 +176,46 @@ class MulGradNPUKernel : public framework::OpKernel { Tensor tmp_dout(x->type()); int64_t dout_first_dim = dout->dims()[0] * dout->dims()[1]; int64_t dout_sec_dim = dout->dims()[2]; - tmp_dout.Resize(framework::make_ddim({dout_first_dim, dout_sec_dim})); - tmp_dout.mutable_data(ctx.GetPlace()); - framework::TensorCopy( - *dout, ctx.GetPlace(), - ctx.template device_context(), &tmp_dout); + tmp_dout.ShareDataWith(*dout); tmp_dout.Resize(framework::make_ddim({dout_first_dim, dout_sec_dim})); if (dx) { - // tmp_dout * y [6,5] * [4,5] => [6, 4] - dx->mutable_data(ctx.GetPlace()); - auto dx_dims = dx->dims(); - dx->Resize(framework::make_ddim({dout_first_dim, y->dims()[0]})); - auto runner_matmul = - NpuOpRunner("MatMul", {tmp_dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", 
true}}); - runner_matmul.Run(stream); - // reshape [2, 12] => [2, 3, 4] - dx->Resize(dx_dims); + // tmp_dout * y [2, 3, 5] * [4,5] => [2, 3, 4] + if (dout->type() == framework::proto::VarType::FP16 && + y->type() == framework::proto::VarType::FP16) { + // NOTE: When the dim of the input and output shapes is inconsistent, + // (Boradcast) BatchMatMul NPU OP only support FP16. + dx->mutable_data(ctx.GetPlace()); + const auto& runner = + NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, + {{"adj_x1", false}, {"adj_x2", true}}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } else { + dx->mutable_data(ctx.GetPlace()); + Tensor tmp_dx(x->type()); + tmp_dx.ShareDataWith(*dx); + tmp_dx.Resize(framework::make_ddim({dout_first_dim, y->dims()[0]})); + + const auto& runner_matmul = + NpuOpRunner("MatMul", {tmp_dout, *y}, {tmp_dx}, + {{"transpose_x1", false}, {"transpose_x2", true}}); + runner_matmul.Run(stream); + } } if (dy) { // flatten x.shape [2,3,4] => [6, 4] Tensor tmp_x(x->type()); int64_t first_dim = x->dims()[0] * x->dims()[1]; int64_t sec_dim = x->dims()[2]; - tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); - tmp_x.mutable_data(ctx.GetPlace()); - framework::TensorCopy( - *x, ctx.GetPlace(), - ctx.template device_context(), &tmp_x); + tmp_x.ShareDataWith(*x); tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); // mamtul [6,4] [6,5] =>[4,5] dy->mutable_data(ctx.GetPlace()); - auto runner_dy = + const auto& runner_dy = NpuOpRunner("MatMul", {tmp_x, tmp_dout}, {*dy}, {{"transpose_x1", true}, {"transpose_x2", false}}); runner_dy.Run(stream); diff --git a/paddle/fluid/operators/nop_op.cc b/paddle/fluid/operators/nop_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..876468f8a7eacaf931e4a76ca0f78f18a4279207 --- /dev/null +++ b/paddle/fluid/operators/nop_op.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class NopOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } +}; + +class NopOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) The input tensor of nop op.").AsDuplicable(); + AddOutput("Out", "(Tensor) The output tensor of nop op.").AsDuplicable(); + AddComment(R"DOC( +Nop Operator + +Do nothing, except let the input and output tensors occupy the memory and +establish the dependency between input and output tensors. 
+)DOC"); + } +}; + +template +class NopKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(nop, ops::NopOp, ops::NopOpMaker); + +REGISTER_OP_CPU_KERNEL(nop, ops::NopKernel); + +REGISTER_OP_CUDA_KERNEL(nop, ops::NopKernel); + +REGISTER_OP_NPU_KERNEL(nop, ops::NopKernel); diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index 276bfa7b3281b9886c6561187c48aec4e9e847c5..4461941e85c2a5445a00c9a0c35f5f9c262d9984 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -32,6 +32,7 @@ namespace operators { static std::map DTYPE_2_ACL_DTYPE = { {framework::proto::VarType::BOOL, ACL_BOOL}, + {framework::proto::VarType::UINT8, ACL_UINT8}, {framework::proto::VarType::INT16, ACL_INT16}, {framework::proto::VarType::INT32, ACL_INT32}, {framework::proto::VarType::INT64, ACL_INT64}, @@ -74,28 +75,50 @@ aclrtStream GetCurrentNPUStream(int device_id) { return dev_ctx->stream(); } -NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) { - attr_ = aclopCreateAttr(); -} +NpuOpRunner::NpuOpRunner() {} + +NpuOpRunner::NpuOpRunner(const std::string &op_type) : op_type_(op_type) {} -NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector &inputs, +NpuOpRunner::NpuOpRunner(const std::string &op_type, + const std::vector &inputs, const std::vector &outputs, const NPUAttributeMap &attrs) : op_type_(op_type) { - attr_ = aclopCreateAttr(); AddInputs(inputs); AddOutputs(outputs); AddAttrs(attrs); } NpuOpRunner::~NpuOpRunner() { - // TODO(zhiqiu): handle free + VLOG(5) << "Free NpuOpRunner(" << this << ") of " << op_type_; + // Is it safe to free the descs/buffers after run called in host ? 
+ aclopDestroyAttr(attr_); // return void + for (auto desc : input_descs_) { + aclDestroyTensorDesc(desc); + } + for (auto desc : output_descs_) { + aclDestroyTensorDesc(desc); + } + for (auto buffer : input_buffers_) { + PADDLE_ENFORCE_NPU_SUCCESS(aclDestroyDataBuffer(buffer)); + } + for (auto buffer : output_buffers_) { + PADDLE_ENFORCE_NPU_SUCCESS(aclDestroyDataBuffer(buffer)); + } } const std::string &NpuOpRunner::Type() { return op_type_; } +NpuOpRunner &NpuOpRunner::SetType(const std::string &name) { + op_type_ = name; + return *this; +} + NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name, const NPUAttribute &attr) { + if (!attr_) { + attr_ = aclopCreateAttr(); + } if (attr.type() == typeid(bool)) { PADDLE_ENFORCE_NPU_SUCCESS( aclopSetAttrBool(attr_, name.c_str(), BOOST_GET_CONST(bool, attr))); @@ -177,6 +200,46 @@ NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor) { return *this; } +NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor, aclMemType mem_type) { + // create aclTensorDesc + input_descs_.emplace_back(CreateTensorDesc(tensor, mem_type)); + // create aclDataBuffer + input_buffers_.emplace_back(CreateDataBuffer(tensor)); + return *this; +} + +NpuOpRunner &NpuOpRunner::AddInput(std::vector &&dims) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast(pool.Get(platform::CPUPlace())); + Tensor host_tensor; + TensorFromVector(dims, *dev_ctx, &host_tensor); + host_tensors_.emplace_back(host_tensor); + + // create aclTensorDesc + input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST)); + // create aclDataBuffer + input_buffers_.emplace_back(CreateDataBuffer(host_tensor)); + + return *this; +} + +NpuOpRunner &NpuOpRunner::AddInput(std::vector &&dims) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast(pool.Get(platform::CPUPlace())); + Tensor host_tensor; + TensorFromVector(dims, *dev_ctx, &host_tensor); + host_tensors_.emplace_back(host_tensor); + + // create aclTensorDesc + input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST)); + // create aclDataBuffer + input_buffers_.emplace_back(CreateDataBuffer(host_tensor)); + + return *this; +} + NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) { // create aclTensorDesc output_descs_.emplace_back(CreateTensorDesc(tensor)); @@ -186,6 +249,8 @@ NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) { } NpuOpRunner &NpuOpRunner::AddInputs(const std::vector &tensors) { + input_descs_.reserve(tensors.size()); + input_buffers_.reserve(tensors.size()); for (auto tensor : tensors) { // create aclTensorDesc input_descs_.emplace_back(CreateTensorDesc(tensor)); @@ -211,6 +276,8 @@ NpuOpRunner &NpuOpRunner::AddInputNames(const std::vector &names) { } NpuOpRunner &NpuOpRunner::AddOutputs(const std::vector &tensors) { + output_descs_.reserve(tensors.size()); + output_buffers_.reserve(tensors.size()); for (auto tensor : tensors) { // create aclTensorDesc output_descs_.emplace_back(CreateTensorDesc(tensor)); @@ -254,21 +321,32 @@ std::vector &NpuOpRunner::GetOutputBuffers() { return output_buffers_; } -aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) { +aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor, + aclMemType mem_type) { auto dtype = ConvertToNpuDtype(tensor.type()); auto format = ConvertToNpuFormat(tensor.layout()); auto dims = framework::vectorize(tensor.dims()); + int size = dims.size(); + // TODO(pangyoki): `keep_prob` used in 
`DropOutGenMask` NPU + // OP must be a scalar with shape[0]. At present, the shape + // of the `prob` Tensor of this OP is forced to be set to 0 + // in `npu_op_runner.cc`, which needs to be optimized later. + if (op_type_ == "DropOutGenMask" && size == 1 && *(dims.data()) == 1) { + size = 0; + } VLOG(4) << "NPU dtype:" << dtype << " " << "rank:" << dims.size() << " dims:" << tensor.dims() << " format:" << format; - auto *desc = aclCreateTensorDesc(dtype, dims.size(), dims.data(), format); + auto *desc = aclCreateTensorDesc(dtype, size, dims.data(), format); PADDLE_ENFORCE_NOT_NULL( desc, platform::errors::External("Call aclCreateTensorDesc failed.")); PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorStorageFormat(desc, format)); - PADDLE_ENFORCE_NPU_SUCCESS( - aclSetTensorStorageShape(desc, dims.size(), dims.data())); + PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorStorageShape(desc, size, dims.data())); + if (mem_type == ACL_MEMTYPE_HOST) { + PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorPlaceMent(desc, mem_type)); + } return desc; } @@ -281,12 +359,12 @@ aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) { return buffer; } -void NpuOpRunner::Run(aclrtStream stream) { +void NpuOpRunner::Run(aclrtStream stream) const { if (!stream) { VLOG(4) << "Run with default current npu stream: " << stream; stream = GetCurrentNPUStream(); } - + VLOG(5) << "NpuOpRunner(" << this << ") Run:"; VLOG(4) << "op_type: " << op_type_; VLOG(4) << "input_desc.size: " << input_descs_.size(); VLOG(4) << "output_desc.size: " << output_descs_.size(); diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h index 5506ddd89692b5c2811bf48acc8e020090c447e7..2257c209550d6056554f32cb2b7a36a277c15088 100644 --- a/paddle/fluid/operators/npu_op_runner.h +++ b/paddle/fluid/operators/npu_op_runner.h @@ -21,6 +21,7 @@ limitations under the License. */ #include #include "acl/acl.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { @@ -30,25 +31,46 @@ using Tensor = framework::Tensor; using DataLayout = framework::DataLayout; using NPUAttribute = framework::NPUAttribute; using NPUAttributeMap = framework::NPUAttributeMap; +using DeviceContextPool = platform::DeviceContextPool; class NpuOpRunner { public: - explicit NpuOpRunner(std::string op_type); - explicit NpuOpRunner(std::string op_type, - const std::vector &inputs = {}, - const std::vector &outputs = {}, - const NPUAttributeMap &attrs = {}); + NpuOpRunner(); + explicit NpuOpRunner(const std::string &op_type); + NpuOpRunner(const std::string &op_type, + const std::vector &inputs = {}, + const std::vector &outputs = {}, + const NPUAttributeMap &attrs = {}); + + // NOTE(zhiqiu): why forbid copy and operator= ? + // Since we will free the tensor_descs and data_buffers in the ~NpuOpRunner, + // if shallow copy is performed on tensor_descs and data_buffers, it may + // result + // in use-after-free bugs. + NpuOpRunner(const NpuOpRunner &runner) = delete; + NpuOpRunner &operator=(const NpuOpRunner &runner) = delete; ~NpuOpRunner(); const std::string &Type(); + NpuOpRunner &SetType(const std::string &name); + NpuOpRunner &AddAttr(const std::string &name, const NPUAttribute &attr); NpuOpRunner &AddAttrs(const NPUAttributeMap &attrs); NpuOpRunner &AddInput(const Tensor &tensor); + // NOTE(zhiqiu): CANN-5.0.2 support input tensors on host. + // Specifically, the tensor of shape, tensor of dims, etc, which are are small + // vector/list. 
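// Illustrative sketch (generic C++, no ACL/Paddle calls; names are
// hypothetical): the lifetime issue the host_tensors_ member added in
// npu_op_runner.cc addresses. A builder that accepts small host-side vectors
// (shapes, dims, ...) must keep its own copy alive until the operator runs,
// otherwise the pointer handed to the backend dangles once the caller's
// temporary is destroyed.
#include <cstdint>
#include <vector>

class HostInputStager {
 public:
  // Stages `dims` in storage owned by the stager; data() stays valid for the
  // stager's lifetime, i.e. until the op is finally executed.
  HostInputStager& AddHostInput(std::vector<int64_t> dims) {
    host_inputs_.emplace_back(std::move(dims));
    return *this;
  }
  const int64_t* HostInputData(size_t i) const { return host_inputs_[i].data(); }

 private:
  std::vector<std::vector<int64_t>> host_inputs_;  // analogous to host_tensors_
};

// Usage: the braced temporary below dies at the end of the full expression,
// so the stager must take ownership of it.
//   HostInputStager stager;
//   stager.AddHostInput({2, 3, 4});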
+ NpuOpRunner &AddInput(const Tensor &tensor, aclMemType mem_type); + + NpuOpRunner &AddInput(std::vector &&dims); + + NpuOpRunner &AddInput(std::vector &&dims); + NpuOpRunner &AddOutput(const Tensor &tensor); NpuOpRunner &AddInputs(const std::vector &tensors); @@ -69,10 +91,11 @@ class NpuOpRunner { std::vector &GetOutputBuffers(); - void Run(aclrtStream stream = nullptr); + void Run(aclrtStream stream = nullptr) const; private: - aclTensorDesc *CreateTensorDesc(Tensor tensor); + aclTensorDesc *CreateTensorDesc(Tensor tensor, + aclMemType mem_type = ACL_MEMTYPE_DEVICE); aclDataBuffer *CreateDataBuffer(Tensor tensor); private: @@ -81,6 +104,7 @@ class NpuOpRunner { std::vector output_buffers_; std::vector input_descs_; std::vector output_descs_; + std::vector host_tensors_; aclopAttr *attr_{nullptr}; }; @@ -96,31 +120,36 @@ void FillNpuTensorWithConstant(Tensor *tensor, T val) { PADDLE_ENFORCE_EQ( platform::is_npu_place(tensor->place()), true, platform::errors::InvalidArgument("The tensor should be on NPUPlace.")); - // do async for better performance - if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) { - Tensor tmp(tensor->type()); - tmp.Resize(tensor->dims()); - tmp.mutable_data(tensor->place()); - auto stream = GetCurrentNPUStream( - BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device); - platform::NPUMemsetAsync(tmp.data(), 0, tmp.numel() * sizeof(T), - stream); - auto runner = NpuOpRunner("Power", {tmp}, {*tensor}, - {{"power", static_cast(1)}, - {"scale", static_cast(0)}, - {"shift", static_cast(val)}}); - runner.Run(stream); - } else { - T *array = new T[tensor->numel()]; - for (unsigned int i = 0; i < tensor->numel(); ++i) { - array[i] = static_cast(val); - } - std::vector vec(tensor->numel(), static_cast(val)); - // do sync copy + + int numel = tensor->numel(); + if (numel == 1) { + Tensor npu_pinned_tensor(tensor->type()); + platform::NPUPinnedPlace npu_pinned_place; + auto npu_pinned_ptr = + npu_pinned_tensor.mutable_data({1}, npu_pinned_place); + *npu_pinned_ptr = val; + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()), - tensor->data(), platform::CPUPlace(), array, - tensor->numel() * sizeof(T), nullptr); - delete[] array; + tensor->data(), npu_pinned_place, npu_pinned_ptr, + sizeof(T), GetCurrentNPUStream()); + + auto npu_pinned_allocator = + static_cast( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(npu_pinned_place) + .get()); + paddle::memory::allocation::Allocation *allocation = + npu_pinned_tensor.Holder().get(); + + npu_pinned_allocator->RecordEvent(allocation, GetCurrentNPUStream()); + } else { + std::vector vec(numel, static_cast(val)); + auto device_id = platform::GetCurrentNPUDeviceId(); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = static_cast( + pool.Get(platform::NPUPlace(device_id))); + + paddle::framework::TensorFromVector(vec, *dev_ctx, tensor); } } diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 621920731fb603c3f3fd526c19b51d7c08d6c954..7536654c5f5ccd7ea18911c9530c5ce42ba9ca3f 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -151,6 +151,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { "as beta2, this has a higher priority than attr(beta2), the " "shape of this tensor MUST BE [1].") .AsDispensable(); + AddInput("EpsilonTensor", + "(Tensor, optional) If provided, Adam will use this " + "as 
epsilon, this has a higher priority than attr(epsilon), the " + "shape of this tensor MUST BE [1].") + .AsDispensable(); AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", "(Tensor) Output parameter"); @@ -193,6 +198,13 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) " "Whether to use multi-precision during weight updating.") .SetDefault(false); + // TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut + // as dispensable since they are not used when use_global_beta_pow is true. + AddAttr("use_global_beta_pow", + "(bool, default false) " + "Whether to use global beta_pow for whole model instead of " + "creating beta_pow for each parameter.") + .SetDefault(false); AddComment(R"DOC( Adam Optimizer. @@ -232,4 +244,25 @@ REGISTER_OP_VERSION(adam) paddle::framework::compatible::OpVersionDesc().NewAttr( "multi_precision", "(bool) Whether to use multi-precision during weight updating.", + false)) + .AddCheckpoint( + R"ROC( + Upgrade adam, add 1 dispensable input [EpsilonTensor]. + )ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "EpsilonTensor", + "If provided, Adam will use this as epsilon, " + "this has a higher priority than attr(epsilon). " + "For better performance in npu kernel. ")) + .AddCheckpoint( + R"ROC( + Upgrade adam, add 1 attribute [use_global_beta_pow]. + )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_global_beta_pow", + "If true, Adam will use global beta_pow for whole model " + "instead of creating beta_pow for each parameter." + "In that case, the outputs(Beta1PowOut, Beta2PowOut) will not be " + "used in adam op, " + "and beta_pow will be updated after all adam op in the model.", false)); diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 54aea67f4ea1b3b3939702a962d9aed773416273..2ee2a08bf3bc63c34e18e668e9875d6ef6132951 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -154,7 +154,9 @@ class AdamOpCUDAKernel : public framework::OpKernel { int64_t min_row_size_to_use_multithread = ctx.Attr("min_row_size_to_use_multithread"); bool lazy_mode = ctx.Attr("lazy_mode"); - MPDType epsilon = static_cast(ctx.Attr("epsilon")); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + auto* param = ctx.Input("Param"); auto* grad_var = ctx.InputVar("Grad"); auto* mom1 = ctx.Input("Moment1"); @@ -188,6 +190,15 @@ class AdamOpCUDAKernel : public framework::OpKernel { beta2_tensor->numel())); beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); } + MPDType epsilon = static_cast(ctx.Attr("epsilon")); + if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() << "beta2_pow.numel() : " << beta2_pow->numel(); VLOG(3) << "param.numel(): " << param->numel(); @@ -245,11 +256,13 @@ class AdamOpCUDAKernel : public framework::OpKernel { lr->data(), grad->data(), param->data(), param_out->mutable_data(ctx.GetPlace()), master_in_data, master_out_data, param->numel()); - // Cpu update - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow->data()[0]; - 
beta2_pow_out->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow->data()[0]; + if (!use_global_beta_pow) { + // Cpu update + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow->data()[0]; + } } else { AdamKernelMEM<<>>( beta1, beta2, epsilon, beta1_pow->data(), @@ -260,14 +273,15 @@ class AdamOpCUDAKernel : public framework::OpKernel { lr->data(), grad->data(), param->data(), param_out->mutable_data(ctx.GetPlace()), master_in_data, master_out_data, param->numel()); - // Update with gpu - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1, beta2, beta1_pow->data(), - beta2_pow->data(), - beta1_pow_out->mutable_data(ctx.GetPlace()), - beta2_pow_out->mutable_data(ctx.GetPlace())); + if (!use_global_beta_pow) { + // Update with gpu + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, beta2, beta1_pow->data(), + beta2_pow->data(), + beta1_pow_out->mutable_data(ctx.GetPlace()), + beta2_pow_out->mutable_data(ctx.GetPlace())); + } } - } else if (grad_var->IsType()) { auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { @@ -319,11 +333,13 @@ class AdamOpCUDAKernel : public framework::OpKernel { param_out->mutable_data(ctx.GetPlace()), master_in_data, master_out_data, rows, row_numel, grad_merge.rows().size(), lazy_mode, ndim); - // Update with cpu - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow->data()[0]; + if (!use_global_beta_pow) { + // Update with cpu + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow->data()[0]; + } } else { SparseAdamFunctor functor( beta1, beta2, epsilon, beta1_pow->data(), @@ -342,12 +358,14 @@ class AdamOpCUDAKernel : public framework::OpKernel { ctx.device_context()), param->numel()); for_range(functor); - // update beta1 and beta2 - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1, beta2, beta1_pow->data(), - beta2_pow->data(), - beta1_pow_out->mutable_data(ctx.GetPlace()), - beta2_pow_out->mutable_data(ctx.GetPlace())); + if (!use_global_beta_pow) { + // update beta1 and beta2 + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, beta2, beta1_pow->data(), + beta2_pow->data(), + beta1_pow_out->mutable_data(ctx.GetPlace()), + beta2_pow_out->mutable_data(ctx.GetPlace())); + } } } else { PADDLE_THROW(platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 6356911f0676a84798aafcbc596f5e7bc0174584..bbd4179d84d896d16a6d7e0c8a4fcfbdf039a71d 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -406,7 +406,9 @@ class AdamOpKernel : public framework::OpKernel { int64_t min_row_size_to_use_multithread = ctx.Attr("min_row_size_to_use_multithread"); bool lazy_mode = ctx.Attr("lazy_mode"); - T epsilon = static_cast(ctx.Attr("epsilon")); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + auto* param = ctx.Input("Param"); auto* grad_var = ctx.InputVar("Grad"); auto* mom1 = ctx.Input("Moment1"); @@ -440,6 +442,15 @@ class AdamOpKernel : public framework::OpKernel { beta2_tensor->numel())); beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); } + T epsilon = static_cast(ctx.Attr("epsilon")); + 
if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() << "beta2_pow.numel() : " << beta2_pow->numel(); VLOG(3) << "param.numel(): " << param->numel(); @@ -466,11 +477,12 @@ class AdamOpKernel : public framework::OpKernel { lr->data(), grad->data(), param->data(), param_out->mutable_data(ctx.GetPlace())); functor(param->numel()); - beta1_pow_out->mutable_data(ctx.GetPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(ctx.GetPlace())[0] = - beta2 * beta2_pow->data()[0]; - + if (!use_global_beta_pow) { + beta1_pow_out->mutable_data(ctx.GetPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(ctx.GetPlace())[0] = + beta2 * beta2_pow->data()[0]; + } } else if (grad_var->IsType()) { auto* grad = ctx.Input("Grad"); if (grad->rows().size() == 0) { @@ -514,10 +526,12 @@ class AdamOpKernel : public framework::OpKernel { param_out->mutable_data(ctx.GetPlace()), rows, row_numel, grad_merge.rows().size(), lazy_mode); // update beta1 and beta2 - beta1_pow_out->mutable_data(ctx.GetPlace())[0] = - beta1 * beta1_pow->data()[0]; - beta2_pow_out->mutable_data(ctx.GetPlace())[0] = - beta2 * beta2_pow->data()[0]; + if (!use_global_beta_pow) { + beta1_pow_out->mutable_data(ctx.GetPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(ctx.GetPlace())[0] = + beta2 * beta2_pow->data()[0]; + } if (lazy_mode) { VLOG(3) << "run cpu lazy mode"; size_t row_count = grad_merge.rows().size(); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index a922a2bca66ad685bd4de341d0fcfd07b4bd0197..70fd546e5042c3ae96ec333c251e72396fef0e59 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -36,7 +36,6 @@ class AdamNPUKernel : public framework::OpKernel { "but the received is %s", ctx.InputNames("Param").front(), framework::ToTypeName(param_var->Type()))); - T epsilon = static_cast(ctx.Attr("epsilon")); auto* param = ctx.Input("Param"); auto* grad_var = ctx.InputVar("Grad"); PADDLE_ENFORCE_EQ(grad_var->IsType(), true, @@ -50,8 +49,8 @@ class AdamNPUKernel : public framework::OpKernel { auto* mom2 = ctx.Input("Moment2"); auto* lr = ctx.Input("LearningRate"); - auto* beta1_pow = ctx.Input("Beta1Pow"); - auto* beta2_pow = ctx.Input("Beta2Pow"); + auto* beta1_pow = ctx.Input("Beta1Pow"); + auto* beta2_pow = ctx.Input("Beta2Pow"); auto* param_out = ctx.Output("ParamOut"); auto* mom1_out = ctx.Output("Moment1Out"); @@ -59,45 +58,77 @@ class AdamNPUKernel : public framework::OpKernel { auto* beta1_pow_out = ctx.Output("Beta1PowOut"); auto* beta2_pow_out = ctx.Output("Beta2PowOut"); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + param_out->mutable_data(ctx.GetPlace()); mom1_out->mutable_data(ctx.GetPlace()); mom2_out->mutable_data(ctx.GetPlace()); - // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform place. + // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform + // place. 
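// Illustrative sketch (plain C++, not the NPU kernel; helper names are
// hypothetical): the Adam update these kernels implement, showing what the
// beta1_pow / beta2_pow accumulators are for and what the new
// use_global_beta_pow attribute changes. The moment update and bias correction
// stay per parameter; the pow accumulators are advanced either per parameter
// (via Beta1PowOut / Beta2PowOut) or once globally after all adam ops.
#include <cmath>
#include <cstddef>
#include <vector>

struct AdamState {
  std::vector<float> m, v;     // per-element first / second moments
  float beta1_pow, beta2_pow;  // beta1^t, beta2^t; start at beta1, beta2 so the
                               // first step corresponds to t = 1
  AdamState(size_t n, float beta1, float beta2)
      : m(n, 0.f), v(n, 0.f), beta1_pow(beta1), beta2_pow(beta2) {}
};

void AdamStep(std::vector<float>* param, const std::vector<float>& grad,
              AdamState* s, float lr, float beta1, float beta2, float epsilon,
              bool use_global_beta_pow) {
  for (size_t i = 0; i < param->size(); ++i) {
    s->m[i] = beta1 * s->m[i] + (1.f - beta1) * grad[i];
    s->v[i] = beta2 * s->v[i] + (1.f - beta2) * grad[i] * grad[i];
    const float m_hat = s->m[i] / (1.f - s->beta1_pow);  // bias correction
    const float v_hat = s->v[i] / (1.f - s->beta2_pow);
    (*param)[i] -= lr * m_hat / (std::sqrt(v_hat) + epsilon);
  }
  if (!use_global_beta_pow) {
    // mirrors writing Beta1PowOut / Beta2PowOut in the kernels above
    s->beta1_pow *= beta1;
    s->beta2_pow *= beta2;
  }  // otherwise one shared pair of pow tensors is advanced once per step
}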
+ LoDTensor beta1_pow_tmp; + LoDTensor beta2_pow_tmp; if (beta1_pow->place() == platform::CPUPlace()) { T beta1 = *beta1_pow->data(); - // `mutable_data` operation needs to be done after getting data - beta1_pow_out->mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(beta1_pow_out, beta1); - } else { - beta1_pow_out->mutable_data(ctx.GetPlace()); + beta1_pow_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&beta1_pow_tmp, beta1); + beta1_pow = &beta1_pow_tmp; } if (beta2_pow->place() == platform::CPUPlace()) { T beta2 = *beta2_pow->data(); - beta2_pow_out->mutable_data(ctx.GetPlace()); - FillNpuTensorWithConstant(beta2_pow_out, beta2); - } else { - beta2_pow_out->mutable_data(ctx.GetPlace()); + beta2_pow_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&beta2_pow_tmp, beta2); + beta2_pow = &beta2_pow_tmp; } - T beta1 = static_cast(ctx.Attr("beta1")); + const Tensor* beta1_tensor = nullptr; + const Tensor* beta2_tensor = nullptr; + const Tensor* epsilon_tensor = nullptr; + + Tensor beta1_tmp(framework::proto::VarType::FP32); + Tensor beta2_tmp(framework::proto::VarType::FP32); + Tensor epsilon_tmp(framework::proto::VarType::FP32); + if (ctx.HasInput("Beta1Tensor")) { - auto* beta1_tensor = ctx.Input("Beta1Tensor"); + beta1_tensor = ctx.Input("Beta1Tensor"); PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, platform::errors::InvalidArgument( "Input(Beta1Tensor) size must be 1, but get %d", beta1_tensor->numel())); - beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); + } else { + T beta1 = static_cast(ctx.Attr("beta1")); + beta1_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&beta1_tmp, beta1); + beta1_tensor = &beta1_tmp; } - T beta2 = static_cast(ctx.Attr("beta2")); + if (ctx.HasInput("Beta2Tensor")) { - auto* beta2_tensor = ctx.Input("Beta2Tensor"); - PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, + beta2_tensor = ctx.Input("Beta2Tensor"); + PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, platform::errors::InvalidArgument( "Input(Beta2Tensor) size must be 1, but get %d", beta2_tensor->numel())); - beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); + } else { + T beta2 = static_cast(ctx.Attr("beta2")); + beta2_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&beta2_tmp, beta2); + beta2_tensor = &beta2_tmp; + } + + if (ctx.HasInput("EpsilonTensor")) { + epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + } else { + T epsilon = static_cast(ctx.Attr("epsilon")); + epsilon_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&epsilon_tmp, epsilon); + epsilon_tensor = &epsilon_tmp; } + VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() << "beta2_pow.numel() : " << beta2_pow->numel(); VLOG(3) << "param.numel(): " << param->numel(); @@ -113,27 +144,14 @@ class AdamNPUKernel : public framework::OpKernel { "beta2 pow output size should be 1, but received " "value is:%d.", beta2_pow_out->numel())); - - // reshape - Tensor beta1_tensor(framework::proto::VarType::FP32); - beta1_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta1_tensor, beta1); - Tensor beta2_tensor(framework::proto::VarType::FP32); - beta2_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&beta2_tensor, beta2); - - Tensor epsilon_tensor(framework::proto::VarType::FP32); - TensorFromVector(std::vector{epsilon}, - ctx.template 
device_context(), - &epsilon_tensor); auto stream = ctx.template device_context() .stream(); - auto runner = + const auto& runner = NpuOpRunner("ApplyAdamD", { *param, *mom1, *mom2, *beta1_pow, *beta2_pow, *lr, - beta1_tensor, beta2_tensor, epsilon_tensor, *grad, + *beta1_tensor, *beta2_tensor, *epsilon_tensor, *grad, }, { *param_out, *mom1_out, *mom2_out, @@ -158,12 +176,16 @@ class AdamNPUKernel : public framework::OpKernel { *mom2, ctx.GetPlace(), ctx.template device_context(), mom2_out); } - auto runner_m1 = - NpuOpRunner("Mul", {*beta1_pow, beta1_tensor}, {*beta1_pow_out}, {}); - runner_m1.Run(stream); - auto runner_m2 = - NpuOpRunner("Mul", {*beta2_pow, beta2_tensor}, {*beta2_pow_out}, {}); - runner_m2.Run(stream); + if (!use_global_beta_pow) { + beta1_pow_out->mutable_data(ctx.GetPlace()); + beta2_pow_out->mutable_data(ctx.GetPlace()); + const auto& runner_m1 = + NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {}); + runner_m1.Run(stream); + const auto& runner_m2 = + NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {}); + runner_m2.Run(stream); + } } }; diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index 3baba424e8f43d801451a27670d131fe136db3e9..0f5706e428e15454e216af8e1067d31720cbf7c7 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -35,8 +35,6 @@ class AdamOpXPUKernel : public framework::OpKernel { framework::ToTypeName(param_var->Type()))); using paddle::framework::LoDTensor; - T epsilon = static_cast(ctx.Attr("epsilon")); - auto& param = GET_DATA_SAFELY(ctx.Input("Param"), "Input", "Param", "Adam"); // auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); @@ -75,6 +73,9 @@ class AdamOpXPUKernel : public framework::OpKernel { "value is:%d.", beta2_pow_out->numel())); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + T beta1 = static_cast(ctx.Attr("beta1")); if (ctx.HasInput("Beta1Tensor")) { auto* beta1_tensor = ctx.Input("Beta1Tensor"); @@ -85,6 +86,11 @@ class AdamOpXPUKernel : public framework::OpKernel { auto* beta2_tensor = ctx.Input("Beta2Tensor"); beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); } + T epsilon = static_cast(ctx.Attr("epsilon")); + if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } if (grad_var->IsType()) { auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", "Grad", "Adam"); @@ -108,45 +114,48 @@ class AdamOpXPUKernel : public framework::OpKernel { mom1_out.template mutable_data(ctx.GetPlace()), mom2_out.template mutable_data(ctx.GetPlace()), param_out.template mutable_data(ctx.GetPlace()), param.numel()); - - // update in cpu and then copy to xpu - if (beta1_pow.place() == platform::CPUPlace() && - beta2_pow.place() == platform::CPUPlace()) { - const T* beta1_pow_p = beta1_pow.template data(); - beta1_pow_out->mutable_data(platform::CPUPlace())[0] = - beta1 * beta1_pow_p[0]; - const T* beta2_pow_p = beta2_pow.template data(); - beta2_pow_out->mutable_data(platform::CPUPlace())[0] = - beta2 * beta2_pow_p[0]; - } else { - T cpu_beta1_pow_out_data; - T cpu_beta2_pow_out_data; - memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data, - BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()), - beta1_pow_ptr, sizeof(T)); - - cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1; - 
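// The XPU hunk above keeps the tiny beta-pow tensors wherever they already
// live: if they are on the host they are multiplied in place, otherwise the
// single scalar is copied to the host, scaled, and copied back to the device
// buffer. A rough host-only sketch of that round trip, with a plain array
// standing in for device memory (the real code uses memory::Copy between
// XPUPlace and CPUPlace):
#include <cstring>
#include <iostream>

struct FakeDeviceBuffer { float data[1]; };  // stand-in for a device allocation

void CopyFromDevice(float* dst, const FakeDeviceBuffer& src) {
  std::memcpy(dst, src.data, sizeof(float));
}
void CopyToDevice(FakeDeviceBuffer* dst, const float* src) {
  std::memcpy(dst->data, src, sizeof(float));
}

void UpdateBetaPow(FakeDeviceBuffer* beta_pow_on_device, float beta) {
  float host_value = 0.f;
  CopyFromDevice(&host_value, *beta_pow_on_device);  // device -> host
  host_value *= beta;                                // scale on the host
  CopyToDevice(beta_pow_on_device, &host_value);     // host -> device
}

int main() {
  FakeDeviceBuffer beta1_pow{{0.9f}};
  UpdateBetaPow(&beta1_pow, 0.9f);
  std::cout << beta1_pow.data[0] << "\n";  // 0.81
  return 0;
}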
memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data, - BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()), - beta2_pow_ptr, sizeof(T)); - - cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2; - - T* beta1_pow_out_p = beta1_pow_out->mutable_data(ctx.GetPlace()); - T* beta2_pow_out_p = beta2_pow_out->mutable_data(ctx.GetPlace()); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - beta1_pow_out_p, platform::CPUPlace(), - &cpu_beta1_pow_out_data, sizeof(T)); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - beta2_pow_out_p, platform::CPUPlace(), - &cpu_beta2_pow_out_data, sizeof(T)); + if (!use_global_beta_pow) { + // update in cpu and then copy to xpu + if (beta1_pow.place() == platform::CPUPlace() && + beta2_pow.place() == platform::CPUPlace()) { + const T* beta1_pow_p = beta1_pow.template data(); + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow_p[0]; + const T* beta2_pow_p = beta2_pow.template data(); + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow_p[0]; + + } else { + T cpu_beta1_pow_out_data; + T cpu_beta2_pow_out_data; + + memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data, + BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()), + beta1_pow_ptr, sizeof(T)); + + cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1; + memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data, + BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()), + beta2_pow_ptr, sizeof(T)); + + cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2; + + T* beta1_pow_out_p = beta1_pow_out->mutable_data(ctx.GetPlace()); + T* beta2_pow_out_p = beta2_pow_out->mutable_data(ctx.GetPlace()); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + beta1_pow_out_p, platform::CPUPlace(), + &cpu_beta1_pow_out_data, sizeof(T)); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + beta2_pow_out_p, platform::CPUPlace(), + &cpu_beta2_pow_out_data, sizeof(T)); + } + + PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, + platform::errors::External( + "XPU API return wrong value[%d], please check " + "where Baidu Kunlun Card is properly installed.", + r)); } - - PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, - platform::errors::External( - "XPU API return wrong value[%d], please check " - "where Baidu Kunlun Card is properly installed.", - r)); } else { PADDLE_ENFORCE_EQ(1, 2, platform::errors::InvalidArgument( "Variable type not supported by adam_op")); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc old mode 100755 new mode 100644 index 479f9643749d63c673158ad055409a0925f3d576..8f30dd5b2e68a4d15d849141b175b8eae503b170 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc @@ -34,6 +34,7 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("LearningRate", "(LoDTensor, default LoDTensor) " "Input learning rate"); + AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", "(LoDTensor) This output is updated parameter. " @@ -41,6 +42,10 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("VelocityOut", "(LoDTensor) This output is updated velocity. " "It shared memory with Input(Velocity)."); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. 
" + "It shared memory with Input(MasterParam).") + .AsDispensable(); AddAttr("mu", "(float) Momentum coefficient"); AddAttr("lars_coeff", "(float, default 0.001) LARS coefficient.") @@ -51,6 +56,15 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("epsilon", "(float, default 0.0) epsilon to avoid Division by Zero.") .SetDefault(0.0); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); + AddAttr( + "rescale_grad", + "(float, default 1.0) Multiply the gradient with `rescale_grad`" + "before updating. Often choose to be `1.0/batch_size`.") + .SetDefault(1.0f); AddComment(R"DOC( Lars Momentum Optimizer. diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index eb0111ae4de2f066359e26406f6c7ec3eb54d5fc..42477232e7ca1b23c53d88eecaa7e13c4197ecbd 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -13,36 +13,64 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/lars_momentum_op.h" namespace paddle { namespace operators { template -__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v, - const T* learning_rate, const T mu, - const int64_t num, const T lars_coeff, - const T lars_weight_decay, const T* p_norm, - const T* g_norm, T* p_out, T* v_out, - const T epsilon) { - T lr = learning_rate[0]; - T local_lr = learning_rate[0]; +using MultiPrecisionType = typename details::MPTypeTrait::Type; + +template +__global__ void MomentumLarsKernel( + const T* p, const T* g, const MT* v, + const MultiPrecisionType* learning_rate, const MT mu, const int64_t num, + const MT lars_coeff, const MT lars_weight_decay, + const MultiPrecisionType* p_norm, const MultiPrecisionType* g_norm, + T* p_out, MT* v_out, const MT epsilon, const MT* master_p, MT* master_p_out, + const MultiPrecisionType rescale_grad) { + const MT lr = static_cast(learning_rate[0]); + MT local_lr = lr; + const MT p_n = static_cast(p_norm[0]); + const MT g_n = static_cast(g_norm[0]); + + if (lars_weight_decay > static_cast(0) && p_n > static_cast(0) && + g_n > static_cast(0)) { + local_lr = + lr * lars_coeff * p_n / (g_n + lars_weight_decay * p_n + epsilon); + } CUDA_KERNEL_LOOP(i, num) { - if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) { - local_lr = lr * lars_coeff * p_norm[0] / - (g_norm[0] + lars_weight_decay * p_norm[0] + epsilon); - } + MT grad = static_cast(g[i]) * static_cast(rescale_grad); + MT param = master_p ? 
master_p[i] : static_cast(p[i]); + + MT v_new = v[i] * mu + local_lr * (grad + lars_weight_decay * param); + MT p_new = param - v_new; - T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]); v_out[i] = v_new; - p_out[i] = p[i] - v_new; + p_out[i] = static_cast(p_new); + if (master_p_out) master_p_out[i] = p_new; } } template class LarsMomentumOpCUDAKernel : public framework::OpKernel { + using MPDType = MultiPrecisionType; + public: void Compute(const framework::ExecutionContext& ctx) const override { + const bool multi_precision = ctx.Attr("multi_precision"); + if (multi_precision) { + InnerCompute(ctx, multi_precision); + } else { + InnerCompute(ctx, multi_precision); + } + } + + private: + template + void InnerCompute(const framework::ExecutionContext& ctx, + const bool multi_precision) const { auto param_out = ctx.Output("ParamOut"); auto velocity_out = ctx.Output("VelocityOut"); auto param = ctx.Input("Param"); @@ -50,18 +78,40 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { auto grad = ctx.Input("Grad"); auto learning_rate = ctx.Input("LearningRate"); + const framework::Tensor* master_param = nullptr; + framework::Tensor* master_param_out = nullptr; + if (multi_precision) { + bool has_master = + ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); + PADDLE_ENFORCE_EQ(has_master, true, + platform::errors::InvalidArgument( + "The Input(MasterParam) and Output(MasterParamOut) " + "should not be null when " + "the attr `multi_precision` is true")); + master_param = ctx.Input("MasterParam"); + master_param_out = ctx.Output("MasterParamOut"); + } + + const MT* master_p = multi_precision ? master_param->data() : nullptr; + MT* master_p_out = multi_precision + ? master_param_out->mutable_data(ctx.GetPlace()) + : nullptr; + T* p_out = param_out->mutable_data(ctx.GetPlace()); - T* v_out = velocity_out->mutable_data(ctx.GetPlace()); + MT* v_out = velocity_out->mutable_data(ctx.GetPlace()); - T mu = static_cast(ctx.Attr("mu")); - T lars_coeff = ctx.Attr("lars_coeff"); - T lars_weight_decay = ctx.Attr("lars_weight_decay"); - T epsilon = ctx.Attr("epsilon"); + MT mu = static_cast(ctx.Attr("mu")); + MT lars_coeff = static_cast(ctx.Attr("lars_coeff")); + MT lars_weight_decay = + static_cast(ctx.Attr("lars_weight_decay")); + MT epsilon = static_cast(ctx.Attr("epsilon")); + MPDType rescale_grad = + static_cast(ctx.Attr("rescale_grad")); auto* p = param->data(); - auto* v = velocity->data(); auto* g = grad->data(); - auto* lr = learning_rate->data(); + auto* v = velocity->data(); + auto* lr = learning_rate->data(); int block = 512; int grid = (param->numel() + block - 1) / block; @@ -72,17 +122,24 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { framework::Tensor p_norm_t, g_norm_t; p_norm_t.Resize({1}); g_norm_t.Resize({1}); - auto* p_norm_data = p_norm_t.mutable_data(ctx.GetPlace()); - auto* g_norm_data = g_norm_t.mutable_data(ctx.GetPlace()); - auto ep_norm = framework::EigenScalar::From(p_norm_t); - auto eg_norm = framework::EigenScalar::From(g_norm_t); + auto* p_norm_data = p_norm_t.mutable_data(ctx.GetPlace()); + auto* g_norm_data = g_norm_t.mutable_data(ctx.GetPlace()); + auto ep_norm = framework::EigenScalar::From(p_norm_t); + auto eg_norm = framework::EigenScalar::From(g_norm_t); auto* place = ctx.template device_context().eigen_device(); - ep_norm.device(*place) = eigen_p.square().sum().sqrt(); - eg_norm.device(*place) = eigen_g.square().sum().sqrt(); - MomentumLarsKernel<<>>( + + // eigen unsupport fp16 l2-norm + 
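// The rewritten MomentumLarsKernel above (a) computes the LARS local learning
// rate once per call from the parameter/gradient norms instead of recomputing
// it inside the element loop, and (b) optionally keeps an FP32 "master" copy
// of FP16 parameters, doing the arithmetic in the wider type and casting only
// the stored parameter back down. A standalone sketch of one LARS step; here
// float plays the low-precision storage type and double the master/compute
// type, purely for illustration:
#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

void LarsStep(std::vector<float>* param, std::vector<double>* master_param,
              std::vector<double>* velocity, const std::vector<float>& grad,
              double lr, double mu, double lars_coeff, double weight_decay,
              double epsilon, double rescale_grad) {
  const std::size_t n = param->size();
  double p_norm = 0.0, g_norm = 0.0;
  for (std::size_t i = 0; i < n; ++i) {
    const double p = (*master_param)[i];
    const double g = static_cast<double>(grad[i]) * rescale_grad;
    p_norm += p * p;
    g_norm += g * g;
  }
  p_norm = std::sqrt(p_norm);
  g_norm = std::sqrt(g_norm);

  double local_lr = lr;  // hoisted out of the loop, as in the new kernel
  if (weight_decay > 0.0 && p_norm > 0.0 && g_norm > 0.0) {
    local_lr = lr * lars_coeff * p_norm /
               (g_norm + weight_decay * p_norm + epsilon);
  }

  for (std::size_t i = 0; i < n; ++i) {
    const double g = static_cast<double>(grad[i]) * rescale_grad;
    const double p = (*master_param)[i];
    const double v_new = (*velocity)[i] * mu + local_lr * (g + weight_decay * p);
    const double p_new = p - v_new;
    (*velocity)[i] = v_new;
    (*master_param)[i] = p_new;               // full-precision copy
    (*param)[i] = static_cast<float>(p_new);  // cast back to the storage type
  }
}

int main() {
  std::vector<float> param{1.0f, -2.0f};
  std::vector<double> master{1.0, -2.0}, velocity{0.0, 0.0};
  std::vector<float> grad{0.1f, 0.2f};
  LarsStep(&param, &master, &velocity, grad, /*lr=*/0.1, /*mu=*/0.9,
           /*lars_coeff=*/0.001, /*weight_decay=*/0.0005, /*epsilon=*/0.0,
           /*rescale_grad=*/1.0);
  std::cout << param[0] << ", " << param[1] << "\n";
  return 0;
}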
ep_norm.device(*place) = + eigen_p.template cast().square().sum().sqrt(); + eg_norm.device(*place) = + (eigen_g.template cast() * rescale_grad).square().sum().sqrt(); + + MomentumLarsKernel< + T, MT><<>>( p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay, - p_norm_data, g_norm_data, p_out, v_out, epsilon); + p_norm_data, g_norm_data, p_out, v_out, epsilon, master_p, master_p_out, + rescale_grad); } }; @@ -93,4 +150,6 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( lars_momentum, ops::LarsMomentumOpCUDAKernel, - ops::LarsMomentumOpCUDAKernel); + ops::LarsMomentumOpCUDAKernel, + ops::LarsMomentumOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index cbb0704fa857b7021acf91ca2f606c3d88aa76a6..f461dec66c0e753cdf170a958f585fa609cd8dac 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -135,6 +135,9 @@ class MomentumOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("VelocityOut", param_dim); + if (ctx->HasOutput("MasterParamOut")) { + ctx->SetOutputDim("MasterParamOut", param_dim); + } } framework::OpKernelType GetExpectedKernelType( diff --git a/paddle/fluid/operators/optimizers/sgd_op.h b/paddle/fluid/operators/optimizers/sgd_op.h index 076121c0e27da7f8292de272bdb8ea38fdf33a0d..076afdc655386c080e3fde99fbba42d3acf59651 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.h +++ b/paddle/fluid/operators/optimizers/sgd_op.h @@ -19,6 +19,9 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/operators/jit/kernels.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/operators/mkldnn/axpy_handler.h" +#endif #include "paddle/fluid/platform/bfloat16.h" namespace paddle { @@ -139,9 +142,15 @@ struct sgd_dense_param_kernel< "Got [%s], but expected less than [%s]", grad_rows[i], grad_height)); const int64_t row = grad_rows[i]; +#ifdef PADDLE_WITH_MKLDNN + operators::onednn_handler_axpy(grad_width, -lr[0], + grad_data + i * grad_width, + out_data + row * grad_width); +#else for (int64_t j = 0; j < grad_width; ++j) { out_data[row * grad_width + j] -= lr[0] * grad_data[i * grad_width + j]; } +#endif } } }; diff --git a/paddle/fluid/operators/optimizers/sgd_op_npu.cc b/paddle/fluid/operators/optimizers/sgd_op_npu.cc index a8d19148ef520cc2b80b23e119e56f5a7b6f920f..446f578b79ff96171f39f8b0bfe3aede03190f5c 100644 --- a/paddle/fluid/operators/optimizers/sgd_op_npu.cc +++ b/paddle/fluid/operators/optimizers/sgd_op_npu.cc @@ -32,7 +32,7 @@ class SGDNPUKernel : public framework::OpKernel { param_out->mutable_data(ctx.GetPlace()); - auto runner = + const auto& runner = NpuOpRunner("ApplyGradientDescent", {*param_var, *learning_rate, *grad_var}, {*param_out}, {}); diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc index 95aaed4453517dd81fcfb277f46df8020be3ac11..087b8ecba6e1fb8b4a0ec44bf6b4dffd5b0e3fb5 100644 --- a/paddle/fluid/operators/pad_constant_like_op.cc +++ b/paddle/fluid/operators/pad_constant_like_op.cc @@ -246,3 +246,18 @@ REGISTER_OP_CPU_KERNEL( ops::PadConstantLikeGradKernel, ops::PadConstantLikeGradKernel); + +REGISTER_OP_CUDA_KERNEL( + pad_constant_like, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel, + ops::PadConstantLikeKernel); +REGISTER_OP_CUDA_KERNEL( + 
pad_constant_like_grad, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel, + ops::PadConstantLikeGradKernel); diff --git a/paddle/fluid/operators/pad_constant_like_op.cu b/paddle/fluid/operators/pad_constant_like_op.cu deleted file mode 100644 index 76faf30ed92000d7093eb73bf6499a43f6ab5b57..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pad_constant_like_op.cu +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/pad_constant_like_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - pad_constant_like, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel, - ops::PadConstantLikeKernel); -REGISTER_OP_CUDA_KERNEL( - pad_constant_like_grad, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel, - ops::PadConstantLikeGradKernel); diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index 577f4f39411e290a88a91bafb61f7dafa7c1cb5f..3bf66c77badb90543e8351c3bca71418d47ff046 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -174,3 +174,16 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( pad_grad, ops::PadGradKernel, ops::PadGradKernel); + +REGISTER_OP_CUDA_KERNEL( + pad, ops::PadKernel, + ops::PadKernel, + ops::PadKernel, + ops::PadKernel, + ops::PadKernel); +REGISTER_OP_CUDA_KERNEL( + pad_grad, ops::PadGradKernel, + ops::PadGradKernel, + ops::PadGradKernel); diff --git a/paddle/fluid/operators/pad_op.cu b/paddle/fluid/operators/pad_op.cu deleted file mode 100644 index 391e305352e55188fb0c502b8efe03af597d48ca..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/pad_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/pad_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - pad, ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel, - ops::PadKernel); -REGISTER_OP_CUDA_KERNEL( - pad_grad, ops::PadGradKernel, - ops::PadGradKernel, - ops::PadGradKernel); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 9117b1b95ed26d03e30c59aa1f77e5de1c2b7755..e84c92d9a1624d2dd569c35461744689ea30eb27 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -31,7 +31,11 @@ namespace operators { template struct DivideFunctor { HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} - HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } + + template + HOSTDEVICE inline U operator()(const U& x) const { + return x * static_cast(n_inv); + } private: T n_inv; diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 8a18843a97263689efed737741c71dc19f593897..b5509e760e8380eb0d85545670d67d346ce3796b 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -95,9 +95,17 @@ class PReluOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -126,6 +134,18 @@ There are modes: )DOC"); AddAttr("mode", "The mode for inputs to share weights.") .SetDefault("all"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "bfloat16"}); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. 
Some layers may run faster when this is true.") + .SetDefault(false); } }; @@ -153,9 +173,17 @@ class PReluGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/prelu_op.h b/paddle/fluid/operators/prelu_op.h index cfc0a2b6fb1128ee4460cbc669772c6257aad8ab..60fd75ce3cffd3e0565945b281ad4c4961385956 100644 --- a/paddle/fluid/operators/prelu_op.h +++ b/paddle/fluid/operators/prelu_op.h @@ -39,13 +39,19 @@ class PReluKernel : public framework::OpKernel { int index = 0; int i = 0; if (mode == "channel") { - int temp = numel / (dim[0] * dim[1]); + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } for (i = 0; i < numel; i++) { index = (i / temp) % dim[1]; o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; } } else if (mode == "element") { - int temp = numel / dim[0]; + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } for (i = 0; i < numel; i++) { index = i % temp; o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i]; @@ -75,18 +81,23 @@ class PReluGradKernel : public framework::OpKernel { auto dim = x->dims(); int index = 0; int i = 0; - int temp = 0; if (dx) { T* dx_ptr = dx->mutable_data(context.GetPlace()); if (mode == "channel") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } for (i = 0; i < numel; i++) { - temp = numel / (dim[0] * dim[1]); index = (i / temp) % dim[1]; dx_ptr[i] = x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i]; } } else if (mode == "element") { - temp = numel / dim[0]; + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } for (i = 0; i < numel; i++) { index = i % temp; dx_ptr[i] = @@ -105,13 +116,19 @@ class PReluGradKernel : public framework::OpKernel { memset(dalpha_ptr, 0, sizeof(T) * dalpha->numel()); if (mode == "channel") { + int temp = 1; + for (int j = 2; j < dim.size(); j++) { + temp *= dim[j]; + } for (i = 0; i < numel; i++) { - temp = numel / (dim[0] * dim[1]); index = (i / temp) % dim[1]; dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i]; } } else if (mode == "element") { - temp = numel / dim[0]; + int temp = 1; + for (int j = 1; j < dim.size(); j++) { + temp *= dim[j]; + } for (i = 0; i < numel; i++) { index = i % temp; dalpha_ptr[index] += x_ptr[i] > 0 ? 
0 : x_ptr[i] * dout_ptr[i]; diff --git a/paddle/fluid/operators/pscore/CMakeLists.txt b/paddle/fluid/operators/pscore/CMakeLists.txt index 12168e61ba5a98fd18c08b2b97911a2e11c02eac..e4d654008d3d03f5136493bf3719636a6c7daf96 100644 --- a/paddle/fluid/operators/pscore/CMakeLists.txt +++ b/paddle/fluid/operators/pscore/CMakeLists.txt @@ -27,7 +27,7 @@ register_operators(DEPS ${DISTRIBUTE_DEPS}) set(OPERATOR_DEPS ${OPERATOR_DEPS} ${DISTRIBUTE_DEPS} PARENT_SCOPE) set_source_files_properties(heter_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(heter_server_test SRCS heter_server_test.cc DEPS ${RPC_DEPS} ${DISTRIBUTE_DEPS} executor scope proto_desc scale_op) +cc_test(heter_server_test SRCS heter_server_test.cc DEPS ${RPC_DEPS} ${DISTRIBUTE_DEPS} executor scope proto_desc scale_op eigen_function) set_source_files_properties(heter_listen_and_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS}) +cc_test(heter_listen_and_server_test SRCS heter_listen_and_server_test.cc DEPS executor scope proto_desc scale_op heter_listen_and_serv_op ${RPC_DEPS} ${DISTRIBUTE_DEPS} eigen_function) diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc index 1d072936f409cf34042ec342ca4a04aaddda3f80..df2eb70b144e4a3cd14384cd4077f44950f89c92 100644 --- a/paddle/fluid/operators/pscore/heter_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_server_test.cc @@ -20,6 +20,8 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/distributed/service/heter_client.h" #include "paddle/fluid/distributed/service/heter_server.h" +#include "paddle/fluid/framework/op_registry.h" + namespace framework = paddle::framework; namespace platform = paddle::platform; namespace distributed = paddle::distributed; diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index 65e10181dcc3df06395ae5cae65efb251021857e..ce6db633c9566e77a6b581fea45b781b75d60e17 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -62,13 +62,22 @@ void RunPyObject(py::object *py_object, for (size_t i = 0; i < result_tuple.size(); i++) { if ((*outs)[i] != nullptr) { if (Py_None != result_tuple[i].ptr()) { - try { - auto result_var = - result_tuple[i].cast>(); - *(*outs)[i] = result_var->Var(); - } catch (py::cast_error &) { + if (py::isinstance(result_tuple[i])) { + try { + auto result_var = + result_tuple[i].cast>(); + *(*outs)[i] = result_var->Var(); + } catch (py::cast_error &) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.backward` function returns invalid argument, " + "the `%s` type argument can not be cast into `Tensor`.", + result_tuple[i].ptr()->ob_type->tp_name)); + } + } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The output of `PyLayer.backward` should be `Tensor`.")); + "The output of `PyLayer.backward` should be `Tensor`, but " + "received `%s`.", + result_tuple[i].ptr()->ob_type->tp_name)); } } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -86,15 +95,30 @@ void RunPyObject(py::object *py_object, } } } else { + if (1 != outs->size()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The number of outputs of `PyLayer.backward` should be %d, but " + "received 1.", + outs->size())); + } if ((*outs)[0] != nullptr) { if 
(Py_None != py_result.ptr()) { - try { - auto result_var = - py_result.cast>(); - *((*outs)[0]) = result_var->Var(); - } catch (py::cast_error &) { + if (py::isinstance(py_result)) { + try { + auto result_var = + py_result.cast>(); + *((*outs)[0]) = result_var->Var(); + } catch (py::cast_error &) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The `PyLayer.backward` function returns invalid argument, the " + "`%s` type argument can not be cast into `Tensor`.", + py_result.ptr()->ob_type->tp_name)); + } + } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The output of `PyLayer.backward` should be `Tensor`.")); + "The output of `PyLayer.backward` should be `Tensor`, but " + "received `%s`", + py_result.ptr()->ob_type->tp_name)); } } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -151,9 +175,12 @@ class PyLayerOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto &op_ = ctx.GetOp(); - auto pylayer_op = dynamic_cast(&op_); - if (pylayer_op) { - auto py_layer_context = pylayer_op->GetPyLayerContext(); + auto const_pylayer_op = dynamic_cast(&op_); + if (const_pylayer_op) { + auto pylayer_op = const_cast(const_pylayer_op); + + // Release contex after executing the compute + auto py_layer_context = pylayer_op->ReleasePyLayerContext(); py::object bk_ctx(py::handle(py_layer_context->GetMutableCtx()), true); auto &input_vars = ctx.MultiInputVar("X"); auto output_vars = ctx.MultiOutputVar("Out"); @@ -190,9 +217,9 @@ REGISTER_OP_CPU_KERNEL( ops::PyLayerOpKernel, ops::PyLayerOpKernel, ops::PyLayerOpKernel, + ::paddle::platform::complex>, ops::PyLayerOpKernel); + ::paddle::platform::complex>); #ifdef PADDLE_WITH_CUDA REGISTER_OP_CUDA_KERNEL( py_layer, ops::PyLayerOpKernel, @@ -209,7 +236,7 @@ REGISTER_OP_CUDA_KERNEL( ops::PyLayerOpKernel, ops::PyLayerOpKernel, ops::PyLayerOpKernel, + ::paddle::platform::complex>, ops::PyLayerOpKernel); + ::paddle::platform::complex>); #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/py_layer_op.h b/paddle/fluid/operators/py_layer_op.h index 133435aa84d71ed32350f25746c5b66c5ba636bf..d80faab90b223622ef18b6244325206bb12156bf 100644 --- a/paddle/fluid/operators/py_layer_op.h +++ b/paddle/fluid/operators/py_layer_op.h @@ -34,6 +34,10 @@ class PyLayerContext { PyLayerContext() = delete; PyObject* GetMutableCtx() { return context_; } + ~PyLayerContext() { + py::gil_scoped_acquire guard; + Py_XDECREF(context_); + } private: PyObject* context_; @@ -58,8 +62,11 @@ class PyLayerOp : public framework::OperatorWithKernel { void SetPyLayerContext(const std::shared_ptr& py_context) { py_context_ = py_context; } - const std::shared_ptr& GetPyLayerContext() const { - return py_context_; + std::shared_ptr ReleasePyLayerContext() { + auto temp = py_context_; + py_context_.reset(); + VLOG(3) << "`py_context_` in the PyLayerOp is released."; + return temp; } private: diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h index ee111a0ec7c0997d9d0380cd4be0c60683b0d3b1..0ebfb2f1bcd2203f987cdc656f0142eff4e009d2 100644 --- a/paddle/fluid/operators/random_crop_op.h +++ b/paddle/fluid/operators/random_crop_op.h @@ -59,16 +59,6 @@ HOSTDEVICE inline void StridedMemcpy(const T* x, const size_t* x_dims, T* out, size_t offset_i = offsets[i]; if (i == rank - 1) { - PADDLE_ENFORCE(x_stride == 1, - "When i:%d == rank:%d - 1, x_stride of random_crop_op " - "expected to be 1, but got %ld. 
Please check input " - "value.", - i, rank, x_stride); - PADDLE_ENFORCE(out_stride == 1, - "When i:%d == rank:%d - 1, out_stride of random_crop_op " - "expected to be 1, but got %ld. Please check input " - "value.", - i, rank, out_stride); x += offset_i; for (size_t j = 0; j < out_dim_i; ++j) { *out++ = *x++; diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index ec9d1fde4533580f862e35d01fbdb6dd0143495a..01f5b4c73271291f0a0eec8f9ff59412700656ce 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -231,3 +231,10 @@ REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL( rank_loss_grad, ops::RankLossGradKernel); + +REGISTER_OP_CUDA_KERNEL(rank_loss, + paddle::operators::RankLossKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(rank_loss_grad, + paddle::operators::RankLossGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/rank_loss_op.cu b/paddle/fluid/operators/rank_loss_op.cu deleted file mode 100644 index ed805279892d0f045fdde94b30c9bc7b19348a9a..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/rank_loss_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/rank_loss_op.h" - -REGISTER_OP_CUDA_KERNEL(rank_loss, - paddle::operators::RankLossKernel< - paddle::platform::CUDADeviceContext, float>); -REGISTER_OP_CUDA_KERNEL(rank_loss_grad, - paddle::operators::RankLossGradKernel< - paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/rank_loss_op.h b/paddle/fluid/operators/rank_loss_op.h index 8609958476f60a0c03b399f8fa2a00b29f3a9011..3373c846ce2c4cade675637cd51e12181172e13b 100644 --- a/paddle/fluid/operators/rank_loss_op.h +++ b/paddle/fluid/operators/rank_loss_op.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -36,8 +37,8 @@ class RankLossKernel : public framework::OpKernel { auto right = framework::EigenVector::Flatten(*right_t); auto& dev = *ctx.template device_context().eigen_device(); - out.device(dev) = - (1.0f + (left - right).exp()).log() - label * (left - right); + EigenRankLoss, T>::Eval(dev, out, label, left, + right); } }; @@ -65,15 +66,15 @@ class RankLossGradKernel : public framework::OpKernel { if (d_left_t) { d_left_t->mutable_data(ctx.GetPlace()); auto d_left = framework::EigenVector::Flatten(*d_left_t); - d_left.device(dev) = - d_out * (1.0f / (1.0f + (right - left).exp()) - label); + EigenRankLossGrad, T>::EvalLeft( + dev, d_left, d_out, label, left, right); } // compute d_right if (d_right_t) { d_right_t->mutable_data(ctx.GetPlace()); auto d_right = framework::EigenVector::Flatten(*d_right_t); - d_right.device(dev) = - -d_out * (1.0f / (1.0f + (right - left).exp()) - label); + EigenRankLossGrad, T>::EvalRight( + dev, d_right, d_out, label, left, right); } } }; diff --git a/paddle/fluid/operators/read_file_op.cc b/paddle/fluid/operators/read_file_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6da92ed7df7d8ea63ea015cd91783edcc4c5d81b --- /dev/null +++ b/paddle/fluid/operators/read_file_op.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class CPUReadFileKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto filename = ctx.Attr("filename"); + + std::ifstream input(filename.c_str(), + std::ios::in | std::ios::binary | std::ios::ate); + std::streamsize file_size = input.tellg(); + + input.seekg(0, std::ios::beg); + + auto* out = ctx.Output("Out"); + std::vector out_shape = {file_size}; + out->Resize(framework::make_ddim(out_shape)); + + uint8_t* data = out->mutable_data(ctx.GetPlace()); + + input.read(reinterpret_cast(data), file_size); + } +}; + +class ReadFileOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + platform::errors::InvalidArgument( + "Output(Out) of ReadFileOp is null.")); + + auto out_dims = std::vector(1, -1); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::UINT8, + platform::CPUPlace()); + } +}; + +class ReadFileOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("Out", "The output tensor of ReadFile op"); + AddComment(R"DOC( +This operator read a file. +)DOC"); + AddAttr("filename", "Path of the file to be readed.") + .SetDefault({}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + read_file, ops::ReadFileOp, ops::ReadFileOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(read_file, ops::CPUReadFileKernel) diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index f5d55791d86c68bf800b869ee2be981bd6ab63b5..17c84530b23e667d8da4bf18cf44a89d44b1b51e 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -68,7 +68,6 @@ BufferedReader::BufferedReader( stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); } #endif - is_same_place_ = false; cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); @@ -116,7 +115,7 @@ void BufferedReader::ReadAsync(size_t i) { std::vector cuda_pinned_ptrs; cuda_pinned_ptrs.reserve(cpu.size()); platform::RecordEvent record_event("BufferedReader:MemoryCopy"); - // NODE(chenwehiang): When we use CUDAPinned Memory, we need call + // NODE(chenweihang): When we use CUDAPinned Memory, we need call // cudaHostAlloc, that is a CUDA API, calling CUDA API need load // cuda lib into device, it will cost hundreds of MB of GPU memory. // If we don't set Device here, which will use CUDAPlace(0) default. 
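// The new read_file op above opens the file with std::ios::ate so tellg()
// yields the size, rewinds, resizes a flat uint8 output to that many bytes,
// and reads the whole file into it. The same idea as a plain function
// returning std::vector<uint8_t> (minimal error handling, just like the
// kernel it mirrors; this is a sketch, not the op itself):
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

std::vector<std::uint8_t> ReadFileBytes(const std::string& filename) {
  std::ifstream input(filename.c_str(),
                      std::ios::in | std::ios::binary | std::ios::ate);
  if (!input) return {};  // keep the sketch total on a missing file
  const std::streamsize file_size = input.tellg();
  input.seekg(0, std::ios::beg);
  std::vector<std::uint8_t> data(static_cast<std::size_t>(file_size));
  input.read(reinterpret_cast<char*>(data.data()), file_size);
  return data;
}

int main(int argc, char** argv) {
  if (argc < 2) return 0;
  std::cout << ReadFileBytes(argv[1]).size() << " bytes\n";
  return 0;
}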
@@ -126,18 +125,21 @@ void BufferedReader::ReadAsync(size_t i) { if (platform::is_cpu_place(cpu[i].place())) { cuda[i].Resize(cpu[i].dims()); cuda[i].set_layout(cpu[i].layout()); - cuda_pinned_ptrs.emplace_back( - cuda[i].mutable_data(cuda_pinned_place, cpu[i].type())); + cuda_pinned_ptrs[i] = + cuda[i].mutable_data(cuda_pinned_place, cpu[i].type()); auto size = cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); memory::Copy(cuda_pinned_place, cuda_pinned_ptrs[i], BOOST_GET_CONST(platform::CPUPlace, cpu[i].place()), cpu[i].data(), size); + cuda[i].set_lod(cpu[i].lod()); } else { - // we set same place flag & use cpu[i] directly - is_same_place_ = true; + // Here the cpu[i]'s place may be CUDAPlace, CUDAPinnedPlace, or + // others, we don't copy the memory of it to CUDAPinnedPlace, but + // we should share tensor data to cuda[i] + cuda[i].ShareDataWith(cpu[i]); } } } else { @@ -296,9 +298,9 @@ void BufferedReader::ReadNextImpl(std::vector *out) { return; } - if (platform::is_gpu_place(place_) && !is_same_place_) { + if (platform::is_gpu_place(place_)) { *out = std::move(cuda_buffer_[i]); - } else if (platform::is_npu_place(place_) && !is_same_place_) { + } else if (platform::is_npu_place(place_)) { *out = std::move(npu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 9f7b0e753281eb2e6476bc931b454b3b15340c3c..5b4bbc7d62cd8f1cdb64b0454279dada2f1a0e69 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -67,7 +67,6 @@ class BufferedReader : public framework::DecoratedReader { // buffer, just read async and create futures as buffer size. However, to // malloc tensors every time is extremely slow. Here we store all data in // buffers and prevent alloc every time. 
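// The buffered_reader hunks above drop the is_same_place_ bookkeeping:
// tensors that already sit in device or pinned memory are no longer flagged
// and returned from cpu_buffer_, they simply share their storage with the
// matching cuda_buffer_ slot, while genuine host tensors are still copied
// into pinned memory. A toy illustration of that share-vs-copy choice using
// shared_ptr as the "storage" (not the real Tensor/ShareDataWith API):
#include <iostream>
#include <memory>
#include <vector>

struct ToyTensor {
  std::shared_ptr<std::vector<float>> storage;
  bool on_host;
};

ToyTensor StageForDevice(const ToyTensor& src) {
  ToyTensor staged;
  if (src.on_host) {
    // Copy path: host data goes into a fresh (conceptually pinned) buffer.
    staged.storage = std::make_shared<std::vector<float>>(*src.storage);
  } else {
    // Share path: reuse the existing allocation, no copy and no extra flag.
    staged.storage = src.storage;
  }
  staged.on_host = false;
  return staged;
}

int main() {
  ToyTensor host{std::make_shared<std::vector<float>>(3, 1.f), true};
  ToyTensor device{std::make_shared<std::vector<float>>(3, 2.f), false};
  std::cout << (StageForDevice(host).storage == host.storage) << "\n";      // 0: copied
  std::cout << (StageForDevice(device).storage == device.storage) << "\n";  // 1: shared
  return 0;
}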
- bool is_same_place_; std::vector cpu_buffer_; std::vector cuda_buffer_; std::vector npu_buffer_; diff --git a/paddle/fluid/operators/real_op.cc b/paddle/fluid/operators/real_op.cc index 5f667999ee613961c44195836bcd36b0530a5c36..1174e72a76b1bb5aa744b964e289f0ac9c66596c 100644 --- a/paddle/fluid/operators/real_op.cc +++ b/paddle/fluid/operators/real_op.cc @@ -95,11 +95,11 @@ REGISTER_OPERATOR(real, ops::RealOp, ops::RealOpMaker, REGISTER_OPERATOR(real_grad, ops::RealGradOp); REGISTER_OP_CPU_KERNEL(real, ops::RealKernel, + paddle::platform::complex>, ops::RealKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL(real_grad, ops::RealGradKernel, + paddle::platform::complex>, ops::RealGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/real_op.cu b/paddle/fluid/operators/real_op.cu index b3d0855111b72f3eba4d9e737b4b650042f7238a..9bfb2878a6261bb5c69a1fb543e5aa15a87c5a8f 100644 --- a/paddle/fluid/operators/real_op.cu +++ b/paddle/fluid/operators/real_op.cu @@ -18,11 +18,11 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(real, ops::RealKernel, + paddle::platform::complex>, ops::RealKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL(real_grad, ops::RealGradKernel, + paddle::platform::complex>, ops::RealGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/reduce_ops/cub_reduce.h b/paddle/fluid/operators/reduce_ops/cub_reduce.h index 29e46e091d06858378cb31a1005ec5687797e583..0aab680e13dc1e570f39773cea6370a31bf1ccea 100644 --- a/paddle/fluid/operators/reduce_ops/cub_reduce.h +++ b/paddle/fluid/operators/reduce_ops/cub_reduce.h @@ -31,6 +31,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" namespace paddle { namespace operators { @@ -66,39 +67,66 @@ struct Array { T data_[ElementCount]; }; +// reduce the 1d array to one element +template +__global__ void ReduceKernel1D(const Tx* x, Ty* y, ReduceOp reducer, + TransformOp transformer, MPType init, + int reduce_num) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + MPType local_data = init; + for (int i = thread_id; i < reduce_num; i += gridDim.x * blockDim.x) { + local_data = static_cast( + reducer(local_data, static_cast(transformer(x[i])))); + } + __syncthreads(); + + local_data = BlockReduce(temp_storage).Reduce(local_data, reducer); + + if (threadIdx.x == 0) { + y[blockIdx.x] = static_cast(local_data); + } +} + // reduce the last axis of 2d array -template +template __global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, + TransformOp transformer, MPType init, int reduce_num) { - __shared__ typename cub::BlockReduce::TempStorage temp_storage; + __shared__ + typename cub::BlockReduce::TempStorage temp_storage; int idx_x = blockIdx.x * reduce_num; int idx_y = threadIdx.x; - Ty reduce_var = init; + MPType reduce_var = init; for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) reduce_var = - reducer(reduce_var, static_cast(transformer(x[idx_x + idx_y]))); + reducer(reduce_var, static_cast(transformer(x[idx_x + idx_y]))); __syncthreads(); - reduce_var = - cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); + reduce_var = cub::BlockReduce(temp_storage) + .Reduce(reduce_var, reducer); if (threadIdx.x == 0) { - y[blockIdx.x] = reduce_var; + y[blockIdx.x] = 
static_cast(reduce_var); } } -template +template __global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, int reduce_num, - Array x_strides, + TransformOp transformer, MPType init, + int reduce_num, Array x_strides, Array reduce_dim, Array reduce_strides, Array left_dim, Array left_strides) { - __shared__ typename cub::BlockReduce::TempStorage temp_storage; + __shared__ + typename cub::BlockReduce::TempStorage temp_storage; Array sub_index; int left_idx = blockIdx.x; for (int i = 0; i < Rank - ReduceRank; ++i) { @@ -114,7 +142,7 @@ __global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer, int idx_x = 0; for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); - Ty reduce_var = static_cast(transformer(x[idx_x])); + MPType reduce_var = static_cast(transformer(x[idx_x])); for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) { int reduce_idx = i; @@ -125,16 +153,16 @@ __global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer, int idx_x = 0; for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); - reduce_var = static_cast( - reducer(reduce_var, static_cast(transformer(x[idx_x])))); + reduce_var = static_cast( + reducer(reduce_var, static_cast(transformer(x[idx_x])))); } __syncthreads(); - reduce_var = - cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); + reduce_var = cub::BlockReduce(temp_storage) + .Reduce(reduce_var, reducer); if (threadIdx.x == 0) { - y[blockIdx.x] = reduce_var; + y[blockIdx.x] = static_cast(reduce_var); } } @@ -192,6 +220,53 @@ static inline void CheckReduceRankIsValid(int reduce_rank, int rank) { } } +template +typename std::enable_if::value, + void>::type +LaunchCubReduceKernel(const Tx* x_data, Ty* y_data, + const platform::Place& place, const ReduceOp& reducer, + const TransformOp& transformer, const MPType& init, + int reduce_num, gpuStream_t stream) { + cub::TransformInputIterator trans_x(x_data, + transformer); + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, + reduce_num, reducer, init, stream); + framework::Tensor tmp; + auto* temp_storage = tmp.mutable_data( + framework::make_ddim({static_cast(temp_storage_bytes)}), place); + cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, + reduce_num, reducer, init, stream); +} + +template +typename std::enable_if::value, + void>::type +LaunchCubReduceKernel(const Tx* x_data, Ty* y_data, + const platform::Place& place, const ReduceOp& reducer, + const TransformOp& transformer, const MPType& init, + int reduce_num, gpuStream_t stream) { + int element_per_block = BlockDim * 10; + int block_per_grid = (reduce_num + element_per_block - 1) / element_per_block; + + framework::Tensor tmp; + auto* temp_storage = tmp.mutable_data( + framework::make_ddim( + {static_cast(block_per_grid * sizeof(MPType))}), + place); + + // each block reduce number to interim result + ReduceKernel1D<<>>( + x_data, temp_storage, reducer, transformer, init, reduce_num); + // reduce all number to final result + ReduceKernel1D<<<1, BlockDim, 0, stream>>>( + temp_storage, y_data, reducer, transformer, init, block_per_grid); +} + template static void TensorReduceImpl( @@ -201,45 +276,40 @@ static void TensorReduceImpl( const std::vector& reduce_dim, const std::vector& reduce_strides, const std::vector& left_dim, const std::vector& left_strides, gpuStream_t stream) { + using MPType = typename details::MPTypeTrait::Type; + MPType init_mp = static_cast(init); + 
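// For low-precision inputs the hunks above (a) accumulate in a wider MPType
// (float for FP16) rather than in the storage type, and (b) appear to route
// that case through two launches of ReduceKernel1D instead of a single
// cub::DeviceReduce call: every block reduces its slice to one partial
// result, then a second launch reduces the partials. A host-side sketch of
// that two-pass, wide-accumulator scheme, with chunks simulating blocks and
// float/double standing in for FP16/MPType:
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Pass 1: each "block" reduces a fixed-size chunk into one partial value.
std::vector<double> ReducePartials(const std::vector<float>& x,
                                   std::size_t chunk) {
  std::vector<double> partials;
  for (std::size_t begin = 0; begin < x.size(); begin += chunk) {
    double acc = 0.0;  // wide accumulator, as with MPType
    const std::size_t end = std::min(begin + chunk, x.size());
    for (std::size_t i = begin; i < end; ++i) acc += static_cast<double>(x[i]);
    partials.push_back(acc);
  }
  return partials;
}

int main() {
  std::vector<float> x(10000, 0.1f);
  // Pass 2: one final reduction over the per-chunk partial results.
  double total = 0.0;
  for (double p : ReducePartials(x, /*chunk=*/512)) total += p;
  std::cout << total << "\n";  // ~1000, with far less rounding drift
  return 0;
}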
#define CUB_RANK_CASE(i, ...) \ case i: { \ constexpr auto kRank = i; \ switch (reduce_rank) { __VA_ARGS__; } \ } break -#define CUB_REDUCE_RANK_CASE(i, ...) \ - case i: { \ - constexpr auto kReduceRank = i; \ - ReduceKernel<<>>( \ - x_data, y_data, reducer, transformer, init, reduce_num, \ - Array::From(x_strides), \ - Array::From(reduce_dim), \ - Array::From(reduce_strides), \ - Array::From(left_dim), \ - Array::From(left_strides)); \ +#define CUB_REDUCE_RANK_CASE(i, ...) \ + case i: { \ + constexpr auto kReduceRank = i; \ + ReduceKernel<<>>( \ + x_data, y_data, reducer, transformer, init_mp, reduce_num, \ + Array::From(x_strides), \ + Array::From(reduce_dim), \ + Array::From(reduce_strides), \ + Array::From(left_dim), \ + Array::From(left_strides)); \ } break int rank = x_strides.size(); int reduce_rank = reduce_strides.size(); if (rank == reduce_rank) { - cub::TransformInputIterator trans_x( - x_data, transformer); - size_t temp_storage_bytes = 0; - cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, - reduce_num, reducer, init, stream); - framework::Tensor tmp; - auto* temp_storage = tmp.mutable_data( - framework::make_ddim({static_cast(temp_storage_bytes)}), - place); - cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, - reduce_num, reducer, init, stream); + LaunchCubReduceKernel( + x_data, y_data, place, reducer, transformer, init_mp, reduce_num, + stream); return; } if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { - ReduceKernel2D<<>>( - x_data, y_data, reducer, transformer, init, reduce_num); + x_data, y_data, reducer, transformer, init_mp, reduce_num); return; } /* @@ -366,33 +436,31 @@ void TensorReduce(const framework::Tensor& x, framework::Tensor* y, #undef CUB_BLOCK_DIM_CASE } -template +template class TransformOp> struct TensorReduceFunctor { const framework::Tensor& x; framework::Tensor* y; std::vector origin_reduce_dims; const double& init; const ReduceOp& reducer; - const TransformOp& transformer; gpuStream_t stream; TensorReduceFunctor(const framework::Tensor& x, framework::Tensor* y, std::vector origin_reduce_dims, const double& init, - const ReduceOp& reducer, const TransformOp& transformer, - gpuStream_t stream) + const ReduceOp& reducer, gpuStream_t stream) : x(x), y(y), origin_reduce_dims(origin_reduce_dims), init(init), reducer(reducer), - transformer(transformer), stream(stream) {} template void apply() const { const Ty& init_cast = static_cast(init); - TensorReduce( - x, y, origin_reduce_dims, init_cast, reducer, transformer, stream); + TensorReduce>(x, y, origin_reduce_dims, + init_cast, reducer, + TransformOp(), stream); } }; diff --git a/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..9cc8ac200b8eec1505177ce752ed8f103908f46a --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/logsumexp_op_xpu.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/reduce_ops/logsumexp_op.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { + +template +class XPULogsumexpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto axis = context.Attr>("axis"); + auto reduce_all = context.Attr("reduce_all"); + + const auto& input_dim_size = input->dims().size(); + // The dims has full dim, set the reduce_all is True + reduce_all |= (static_cast(axis.size()) == input_dim_size); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(context.GetPlace()); + + std::vector axis_shape; + std::vector xdims(input_dim_size); + for (int i = 0; i < input_dim_size; ++i) { + xdims[i] = input->dims()[i]; + } + if (reduce_all) { + for (int i = 0; i < input_dim_size; ++i) { + axis_shape.push_back(i); + } + } else { + for (size_t i = 0; i < axis.size(); ++i) { + int rdim = axis[i] < 0 ? axis[i] + input_dim_size : axis[i]; + axis_shape.push_back(rdim); + } + } + + auto& dev_ctx = context.template device_context(); + int r = xpu::logsumexp(dev_ctx.x_context(), input_data, output_data, + xdims, axis_shape); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU logsumexp kernel error! error value[%d %]", r, + XPUAPIErrorMsg[r])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + logsumexp, + ops::XPULogsumexpKernel); +#endif diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mean_mkldnn_op.cc b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mean_mkldnn_op.cc index 33daeea8599c64c205f4587837f0271496aaa713..dfba933940bd0209c3a1754fbdcf830ba8dd55c7 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mean_mkldnn_op.cc +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mean_mkldnn_op.cc @@ -45,7 +45,8 @@ class ReduceMeanGradMKLDNNKernel : public ReduceGradMKLDNNKernel { number_of_elements = input_x->numel(); } - this->RunKernel(ctx, dnnl::algorithm::binary_add, 0.0f, + this->RunKernel(ctx, dnnl::algorithm::binary_add, + dnnl::algorithm::reduction_mean, 0.0f, 1.0L / number_of_elements); } }; diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h index 58416f479c04354f24ad113d6a69e84fedae6b07..40cd3ba974f04c0196101f432cf8d51f2b00ce34 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h @@ -21,6 +21,27 @@ using paddle::framework::LoDTensor; using paddle::framework::Tensor; using platform::to_void_cast; +inline std::vector CalculateReducedDims(const Tensor* input, + const Tensor* output, + std::vector& reduce_dims, + bool reduce_all, + bool keep_dim) { + if (keep_dim) return framework::vectorize(output->dims()); + + if (reduce_all) + return std::vector(framework::vectorize(input->dims()).size(), 1); + + std::vector output_dims(framework::vectorize(input->dims())); + for (size_t i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = (reduce_dims[i] >= 0) + ? 
reduce_dims[i] + : input->dims().size() + reduce_dims[i]; + output_dims[reduce_dims[i]] = 1; + } + + return output_dims; +} + template class ReduceMKLDNNKernel : public framework::OpKernel { public: @@ -37,9 +58,8 @@ class ReduceMKLDNNKernel : public framework::OpKernel { bool reduce_all = ctx.Attr("reduce_all"); bool keep_dim = ctx.Attr("keep_dim"); - std::vector output_dims = - CalculateOutputDims(input, output, reduce_dims, reduce_all, keep_dim); - + auto output_dims = + CalculateReducedDims(input, output, reduce_dims, reduce_all, keep_dim); auto input_dims = framework::vectorize(input->dims()); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -96,53 +116,63 @@ class ReduceMKLDNNKernel : public framework::OpKernel { paddle::framework::vectorize(output->dims())))); } } - - private: - std::vector CalculateOutputDims(const Tensor* input, - const Tensor* output, - std::vector& reduce_dims, - bool reduce_all, - bool keep_dim) const { - if (keep_dim) return framework::vectorize(output->dims()); - - if (reduce_all) - return std::vector(framework::vectorize(input->dims()).size(), - 1); - - std::vector output_dims(framework::vectorize(input->dims())); - for (size_t i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = (reduce_dims[i] >= 0) - ? reduce_dims[i] - : input->dims().size() + reduce_dims[i]; - output_dims[reduce_dims[i]] = 1; - } - - return output_dims; - } }; template class ReduceGradMKLDNNKernel : public framework::OpKernel { public: void RunKernel(const framework::ExecutionContext& ctx, - dnnl::algorithm binary_type, float scale_x, - float scale_y) const { + dnnl::algorithm binary_type, dnnl::algorithm reduction_type, + float scale_x, float scale_y) const { const auto& dev_ctx = ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); + bool keep_dim = ctx.Attr("keep_dim"); + bool reduce_all = ctx.Attr("reduce_all"); auto dims = ctx.Attr>("dim"); auto* input_dy = ctx.Input(framework::GradVarName("Out")); auto* output_dx = ctx.Output(framework::GradVarName("X")); + mkldnn::memory::format_tag x_format_tag; + auto input_dims = + CalculateReducedDims(output_dx, input_dy, dims, reduce_all, keep_dim); + + if (input_dims != framework::vectorize(output_dx->dims())) { + const std::string key_pd = + platform::CreateKey( + dev_ctx, framework::vectorize(output_dx->dims()), + ctx.InputName("X"), + (std::to_string(static_cast(reduction_type)))) + + "@fwd_pd"; + std::shared_ptr fwd_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_pd)); + + PADDLE_ENFORCE_NOT_NULL( + fwd_pd, platform::errors::Unavailable( + "Forward primitive descriptor is not available in %s op, " + "cannot deduce memory format tag", + ctx.Type())); + + x_format_tag = platform::GetMKLDNNFormat(fwd_pd->src_desc()); + + PADDLE_ENFORCE_NE(x_format_tag, mkldnn::memory::format_tag::undef, + platform::errors::InvalidArgument( + "Cannot deduce format tag for %s op", ctx.Type())); + } else { // fwd descriptor not available because reorder was used instead + // of reduction + x_format_tag = getPlainFormatTag(output_dx); + } + output_dx->mutable_data(ctx.GetPlace()); - output_dx->set_format(getPlainFormatTag(output_dx)); + output_dx->set_format(x_format_tag); output_dx->set_layout(input_dy->layout()); platform::BroadcastDataMKLDNNHandler handler( binary_type, dev_ctx, onednn_engine, ctx.GetPlace(), output_dx, input_dy, scale_x, scale_y, - ctx.InputName(framework::GradVarName("Out"))); + ctx.InputName(framework::GradVarName("Out")), input_dims); const auto src_dx_memory = 
handler.AcquireSrcMemory(output_dx); const auto src_dy_memory = handler.AcquireSecondSrcMemory(input_dy); diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_sum_mkldnn_op.cc b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_sum_mkldnn_op.cc index e62edcf559677e3cef3582b46dd0cdbc01b82e30..3f92d39ede1ae8cbc564e9e68f54c72c0160f75c 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_sum_mkldnn_op.cc +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_sum_mkldnn_op.cc @@ -29,7 +29,8 @@ template class ReduceSumGradMKLDNNKernel : public ReduceGradMKLDNNKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - this->RunKernel(ctx, dnnl::algorithm::binary_add, 0.0f, 1.0f); + this->RunKernel(ctx, dnnl::algorithm::binary_add, + dnnl::algorithm::reduction_sum, 0.0f, 1.0f); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 89f3345fcbe42deb572700cb12827d79cb22d3d3..99a5caaad6ab802facaec6a3b5c4c5e2384945d4 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +// reduce_prod REGISTER_OP_CUDA_KERNEL( - reduce_all, ops::BoolReduceKernel); + reduce_all, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index c0f94098a351ea9042e44b8550b305bb0f9d74c6..c7eafa2ac8760a3edde56a9f2411c6faaac454f1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -13,7 +13,10 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +// reduce_prod REGISTER_OP_CUDA_KERNEL( - reduce_any, ops::BoolReduceKernel); + reduce_any, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc index 39e74c908ae7ab5c420f07a559804d5aa5a9c216..e9d5c5f14c51f827353f54d1c84b50578ab7d41a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu.cc @@ -38,7 +38,7 @@ class ReduceAnyNPUKernel : public framework::OpKernel { // set attr NPUAttributeMap attr = {{"keep_dims", keep_dim}, {"axes", dims}}; - auto runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr); + const auto& runner = NpuOpRunner("ReduceAnyD", {*x}, {*out}, attr); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/reduce_ops/reduce_functor_op.h b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h new file mode 100644 index 0000000000000000000000000000000000000000..0f02be21cc90783bb35aba419aebc2bceaca0125 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_functor_op.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/platform/hostdevice.h" +#ifdef __HIPCC__ +#include +#endif + +namespace paddle { +namespace operators { + +template +struct CustomMin { + using Transformer = detail::IdentityFunctor; + + inline Ty initial() { + return static_cast(std::numeric_limits::max()); + } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { + return (b < a) ? b : a; + } +}; + +template +struct CustomMax { + using Transformer = detail::IdentityFunctor; + + inline Ty initial() { + return static_cast(std::numeric_limits::lowest()); + } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { + return (b > a) ? b : a; + } +}; + +// for cub::Reduce +template +struct CustomSum { + using Transformer = detail::IdentityFunctor; + + inline Ty initial() { return static_cast(0.0f); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { + return b + a; + } +}; + +template +struct CustomMean { + using Transformer = detail::DivideFunctor; + + inline Ty initial() { return static_cast(0.0f); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { + return b + a; + } +}; + +template +struct CustomMul { + using Transformer = detail::IdentityFunctor; + + inline Ty initial() { return static_cast(1.0f); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { + return b * a; + } +}; + +template +struct CustomLogicalOr { + using Transformer = detail::IdentityFunctor; + + inline Ty initial() { return static_cast(false); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { + return b || a; + } +}; + +template +struct CustomLogicalAnd { + using Transformer = detail::IdentityFunctor; + + inline Ty initial() { return static_cast(true); } + + __device__ __forceinline__ Ty operator()(const Ty &a, const Ty &b) const { + return b && a; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu index 832112ede833a06e053dcff5139e82f054b127c4..f214fcba199a3690d05acc7d78da5bcad16d18cf 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op.cu @@ -11,15 +11,13 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
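// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch itself: the CustomMin/CustomMax/
// CustomSum/CustomMean/CustomMul/CustomLogicalOr/CustomLogicalAnd functors in
// reduce_functor_op.h above all share one contract -- an initial() value, a
// binary operator()(a, b), and a Transformer applied to every element before
// it is folded in. The hypothetical host-side helpers below (HostSum and
// HostReduce are made-up names for this example) mirror that contract so the
// intent can be checked without a GPU.
#include <vector>

template <typename T>
struct HostSum {
  T initial() const { return static_cast<T>(0); }
  T operator()(const T& a, const T& b) const { return a + b; }
};

// Folds `data` with any functor exposing initial() and operator()(a, b),
// mirroring how the CUDA reduce kernels consume CustomSum and friends.
template <typename T, typename Reducer>
T HostReduce(const std::vector<T>& data, Reducer reducer) {
  T acc = reducer.initial();
  for (const T& v : data) {
    acc = reducer(acc, v);
  }
  return acc;
}
// e.g. HostReduce(std::vector<float>{1.f, 2.f, 3.f}, HostSum<float>()) == 6.f
// ---------------------------------------------------------------------------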
+#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL(reduce_max, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +// reduce_max +REGISTER_OP_CUDA_KERNEL( + reduce_max, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index fdb2c57385b2bc1068c618f206bfeb6513d3d8c4..c8d568c8c2cf73041549a138085b72b41c0c297a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -100,6 +100,8 @@ REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradOpBaseMaker, ops::ReduceMeanGradNoNeedBufferVarInferer); REGISTER_OP_CPU_KERNEL(reduce_mean, + ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel; -REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, +REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, + CPUReduceMeanGradKernel, CPUReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu index cc3653fcb43a4c000d0c61c9d854965fafd59a9c..50d2fcdee23bd9e830f32e0cff4d367c3ad5ba66 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu @@ -65,5 +65,6 @@ class ReduceMeanKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, +REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, + ops::ReduceMeanKernel, ops::ReduceMeanKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu index 289f574719ff03b1b09f313d05bab152f5c5d651..0e133d5447f93b8891c6de4cb5ad40ac7825493b 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu @@ -20,5 +20,6 @@ using CUDAReduceMeanGradKernel = ops::ReduceGradKernel; -REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, +REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, + CUDAReduceMeanGradKernel, CUDAReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu index 7b2706866f594228cbceb084e99d83aa8f345dfd..7806df284d8c06d60a26698679b875a8cb9f7844 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_min_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op.cu @@ -11,15 +11,13 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" - -REGISTER_OP_CUDA_KERNEL(reduce_min, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +// reduce_min +REGISTER_OP_CUDA_KERNEL( + reduce_min, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h new file mode 100644 index 0000000000000000000000000000000000000000..61efa409b90c3ed7bcedffbd08896ab13ec2b74c --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -0,0 +1,848 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif + +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/fluid/framework/array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/fast_divmod.h" + +// Reduce split or not, Whether to use ReduceHigherDim +#define REDUCE_SPLIT_BOUNDARY 512 +#define REDUCE_VEC_SIZE 4 + +namespace paddle { +namespace operators { +namespace detail { + +// Post processing function for sum, max, min, prod, any +template +struct IdentityFunctor { + HOSTDEVICE explicit inline IdentityFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x); + } +}; + +// Post processing function for mean +template +struct DivideFunctor { + HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} + + HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } + + private: + T n_inv; +}; + +static inline int GetLastPow2(int n) { + n |= (n >> 1); + n |= (n >> 2); + n |= (n >> 4); + n |= (n >> 8); + n |= (n >> 16); + return std::max(1, n - (n >> 1)); +} + +static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; } + +// get strides of x_dim, reduce_dim and left_dim for reduceLastDim and reduceAny +static inline std::vector GetDimStrides(const std::vector& dims, + const std::vector& idx) { + int n = static_cast(idx.size()); + if (n == 0) return std::vector(); + std::vector strides(n); + strides.back() = 1; + for (int i = n - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[idx[i + 1]]; + } + return strides; +} + +#ifdef __HIPCC__ +constexpr int kMaxThread = 256; +constexpr int kWarpSize = 64; +#else +constexpr int kMaxThread = 128; +constexpr int kWarpSize = 32; +#endif + +// get blockDim for reduceLastDim and reduceAny +static inline int GetBlockDim(int block_dim) { + return 
block_dim >= kMaxThread ? kMaxThread : GetLastPow2(block_dim); +} + +// check reduce rand is valid +static inline void CheckReduceRank(int reduce_rank, int rank) { + if (rank % 2 == 0) { + PADDLE_ENFORCE_EQ(reduce_rank, rank / 2, + platform::errors::InvalidArgument( + "ReduceOp: invalid reduce rank. When rank = %d, " + "reduce_rank must be %d, but got %d.", + rank, rank / 2, reduce_rank)); + } else { + auto lower_rank = (rank - 1) / 2; + auto upper_rank = (rank + 1) / 2; + PADDLE_ENFORCE_EQ( + reduce_rank == lower_rank || reduce_rank == upper_rank, true, + platform::errors::InvalidArgument( + "ReduceOp: invalid reduce rank. When rank = %d, reduce_rank " + "must be %d or %d, but got %d.", + rank, lower_rank, upper_rank, reduce_rank)); + } +} + +// convert dims from vector to array +template +static inline paddle::framework::Array VectorToArray( + const VectorLikeType& vec) { + PADDLE_ENFORCE_LE(vec.size(), ElementCount, + platform::errors::InvalidArgument( + "Cub reduce Array: size not match. Received " + "vec.size() %d > ElementCount %d.", + vec.size(), ElementCount)); + size_t n = static_cast(vec.size()); + paddle::framework::Array ret; + for (size_t i = 0; i < n; ++i) { + ret[i] = vec[i]; + } + return ret; +} + +} // namespace detail + +using Tensor = framework::Tensor; +constexpr int kMaxRank = framework::DDim::kMaxRank; + +enum ReduceType { + kReduceAll = 0x00, // when reduce_rank == x_rank + kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; + kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim + kReduceAny = 0x03, // when reduce_dim.size() > 1 +}; + +struct IndexCalculator { + IndexCalculator(int dim, const std::vector& cal_dims, + const std::vector& cal_strides, + const std::vector& full_strides) + : dim(dim) { + dims = detail::VectorToArray(cal_dims); + strides = detail::VectorToArray(full_strides); + std::vector cal_divmoders; + // fast divmod + for (auto i : cal_strides) { + cal_divmoders.push_back(FastDivMod(i)); + } + divmoders = detail::VectorToArray(cal_divmoders); + } + + __device__ inline int Get(int offset) const { + int index = 0; +#pragma unroll + for (int i = 0; i < kMaxRank; ++i) { + if (i == dim) { + break; + } + auto divmod = divmoders[i].Divmod(offset); + index += (divmod.val[0] * strides[dims[i]]); + offset = divmod.val[1]; + } + return index; + } + + int dim; + framework::Array dims; + framework::Array strides; + framework::Array divmoders; +}; + +// reduce config +template +struct ReduceConfig { + ReduceConfig(const std::vector& origin_reduce_dims, + const std::vector& origin_x_dim) + : reduce_dims_origin(origin_reduce_dims), x_dim(origin_x_dim) {} + + // get the parameters of reduceKernel + void Run() { + // step1: update the reduce_dim left_dim and x_dim + SetReduceDim(); + + // step2: get the strides of dim for reduceAny and reduceLastDim + SetStrides(); + + // step3: get the type of reduce + SetReduceType(); + + // step4: set the block and grid for launch kernel + SetBlockDim(); + } + + // when should_reduce_again is true, we need malloc temp space for temp data + void SetOutputData(Ty* y_data, const platform::Place& place, + framework::Tensor* tmp) { + if (should_reduce_again) { + output_data = tmp->mutable_data( + framework::make_ddim( + {static_cast(left_num * grid.z * grid.y * sizeof(Ty))}), + place); + } else { + output_data = y_data; + } + } + + private: + // set reduce_dim, left_dim and update x_dim + // eg: x_dim = [2, 4, 6] origin_reduce_dims = [0, 1] + // --SetReduceDim--> x_dim = [8,6], reduce_dim = [0], left_dim = [1] + 
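// NOTE (illustrative example, not from the original patch): the merge sketched
// above collapses runs of dimensions that are either all reduced or all kept.
// Another case:
//   x_dim = [4, 3, 5, 7], origin_reduce_dims = [1, 2]
//     --SetReduceDim--> x_dim = [4, 15, 7], reduce_dim = [1], left_dim = [0, 2]
// so the kernels only ever see the minimal rank that still describes the
// reduction pattern, which is what lets low-rank cases hit the fast
// kReduceLastDim / kReduceHigherDim paths chosen in SetReduceType().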
void SetReduceDim() { + std::set reduce_set; + for (auto e : reduce_dims_origin) { + auto pos = e >= 0 ? e : e + x_dim.size(); + reduce_set.insert(pos); + } + + std::vector reduce_dim_temp(reduce_set.begin(), reduce_set.end()); + std::sort(reduce_dim_temp.begin(), reduce_dim_temp.end()); + + // update reduce_dim and x_dim + std::vector x_new_dim; + + reduce_dim.push_back(reduce_dim_temp[0]); + x_new_dim.push_back(x_dim[0]); + + int idx_reduce = 1; + int num = 0; + + if (reduce_dim_temp.size() > 1) { + for (int i = 1; i < x_dim.size(); i++) { + if ((idx_reduce < reduce_dim_temp.size()) && + (i == reduce_dim_temp[idx_reduce])) { + int result = + reduce_dim_temp[idx_reduce] - reduce_dim[reduce_dim.size() - 1]; + bool is_equal = ((result - num) == 1); + if (is_equal) { + x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; + num++; + } else { + reduce_dim.push_back(reduce_dim_temp[idx_reduce] - num); + x_new_dim.push_back(x_dim[i]); + } + idx_reduce++; + } else { + x_new_dim.push_back(x_dim[i]); + } + } + } else { + x_new_dim = x_dim; + } + + // update x_dim + x_dim = x_new_dim; + std::vector().swap(x_new_dim); + + std::vector reduce_dim_new; + int is_reduced = 0; + for (auto e : reduce_dim) { + is_reduced |= 1 << e; + } + + std::vector().swap(reduce_dim); + + for (int i = 0; i < x_dim.size(); i++) { + if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { + x_new_dim.push_back(x_dim[i]); + if ((is_reduced >> i) & 1) + reduce_dim_new.push_back(x_new_dim.size() - 1); + } else { + x_new_dim[x_new_dim.size() - 1] *= x_dim[i]; + } + } + + x_dim = x_new_dim; + reduce_dim = reduce_dim_new; + + int x_rank = static_cast(x_dim.size()); + std::set left_set; + + for (int i = 0; i < x_rank; ++i) { + left_set.insert(i); + } + + for (auto e : reduce_dim) { + left_set.erase(e); + } + + left_dim.assign(left_set.begin(), left_set.end()); + + // if the last dim gets involved in reduction + reduce_lastdim = (reduce_dim.back() == x_dim.size() - 1); + } + + // set x_strides, reduce_strides, left_strides for reduceLastDim and reduceAny + // eg: x_dim = [8, 6], reduce_dim = [0], left_dim = [1] + // --SetStrides--> x_strides= [6,1], reduce_strides = [1], + // left_strides = [1] + void SetStrides() { + std::vector idx_dim; + for (int i = 0; i < x_dim.size(); i++) { + idx_dim.push_back(i); + } + + x_strides = detail::GetDimStrides(x_dim, idx_dim); + reduce_strides = detail::GetDimStrides(x_dim, reduce_dim); + left_strides = detail::GetDimStrides(x_dim, left_dim); + reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; + + left_num = 1; + if (left_dim.size()) { + left_num = left_strides[0] * x_dim[left_dim[0]]; + } + } + + // get the reduceType + // eg: x_dim = [8, 6] reduce_dim = [0] --> ReduceHigherDim -->reduceFirstDim + // x_dim = [8, 6] reduce_dim = [1] --> reduceLastDim + // x_dim = [8] reduce_dim = [0] --> reduceAll + // x_dim = [8, 6, 4, 2] reduce_dim = [0, 2] --> reduceAny + void SetReduceType() { + int rank = x_dim.size(); + int reduce_rank = reduce_dim.size(); + bool is_large_enough = (reduce_num > REDUCE_SPLIT_BOUNDARY / 2) || + (left_num > REDUCE_SPLIT_BOUNDARY); + + if (rank == reduce_rank) { + reduce_type = static_cast(ReduceType::kReduceAll); + } else if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { + reduce_type = static_cast(ReduceType::kReduceLastDim); + } else if (reduce_rank == 1 && + ((rank == 2 && is_large_enough) || rank != 2)) { + // ReduceFirstDim and reduceSecondDim + reduce_type = static_cast(ReduceType::kReduceHigherDim); + } else { + reduce_type = 
static_cast(ReduceType::kReduceAny);
+    }
+  }
+
+  void SetBlockDimForReduceAny(dim3* block_dim, dim3* grid_dim) {
+    constexpr int min_reduce_num_per_thread = 16;
+    constexpr int max_reduce_num_per_thread = 256;
+    constexpr int max_num_threads = detail::kMaxThread;
+
+    // set block size.
+    // 1. if reduce_lastdim == true, the block is 1-D; no reduction in block y
+    //    is needed;
+    // 2. if reduce_lastdim == false, the block is 2-D and, if necessary,
+    //    reduces in block y.
+    int grid_num, reduce_num_per_thread;
+    if (reduce_lastdim) {
+      block_dim->x = detail::GetBlockDim(reduce_num);
+      block_dim->y = 1;
+      grid_num = left_num;
+      reduce_num_per_thread =
+          detail::AlignUp(reduce_num, block_dim->x * block_dim->y);
+    } else {
+      int block_x = detail::GetBlockDim(left_num);
+      int block_y = detail::GetBlockDim(reduce_num);
+      block_dim->x = std::min(block_x, 32);
+      block_dim->y =
+          std::min(block_y, static_cast(max_num_threads / block_dim->x));
+      block_dim->x =
+          std::min(block_x, static_cast(max_num_threads / block_dim->y));
+      grid_num = detail::AlignUp(left_num, block_dim->x);
+      reduce_num_per_thread = detail::AlignUp(reduce_num, block_dim->y);
+    }
+    int device_id = platform::GetCurrentDeviceId();
+    int max_mp = platform::GetCUDAMultiProcessors(device_id);
+    int max_threads_per_mp =
+        platform::GetCUDAMaxThreadsPerMultiProcessor(device_id);
+    int max_threads = max_threads_per_mp * max_mp;
+    int num_threads = block_dim->x * block_dim->y;
+    int max_num_blocks = max_threads / num_threads;
+
+    // set grid size.
+    // When deciding whether grid.y should be larger than 1, the following
+    // rules apply:
+    // 1. the number of elements each thread processes should be no less than
+    //    min_reduce_num_per_thread but no more than max_reduce_num_per_thread;
+    // 2. the utilization of the SMs should be maximized.
+    // So we choose the minimum between input_split_num_1 and input_split_num_3
+    // to make each thread process as much data as possible. Meanwhile,
+    // the number cannot be larger than max_reduce_num_per_thread, so we
+    // choose the maximum between the result above and input_split_num_2.
+    int input_split_num_1 =
+        detail::AlignUp(reduce_num_per_thread, min_reduce_num_per_thread);
+    int input_split_num_2 =
+        detail::AlignUp(reduce_num_per_thread, max_reduce_num_per_thread);
+    int input_split_num_3 = detail::AlignUp(max_num_blocks, grid_num);
+
+    grid_dim->x = grid_num;
+    grid_dim->y = std::max(std::min(input_split_num_1, input_split_num_3),
+                           input_split_num_2);
+    // if grid.y > 1, we need to launch the reduce kernel again.
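// NOTE (illustrative, not part of the original patch): when grid.y > 1 each
// block row along y only produces a partial result, so the first launch writes
// an intermediate buffer of left_num * grid.y (* grid.z) elements -- the temp
// space set up in SetOutputData() -- and a second, much smaller kernel launch
// then reduces over the grid.y axis into the final output. For instance, a
// reduce_num of 65536 split across grid.y = 16 block rows means each block
// privately reduces 4096 elements, and the follow-up launch combines the 16
// partial results per output element.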
+ if (grid_dim->y > 1) { + should_reduce_again = true; + } + } + + // set block and grid for launch kernel + // for ReduceHigherDim: if block is enough -> splite reduce_num + // else init block(32, 1) grid(block_num, 1) + // for others: block(block_num, 1) , grid(left_num, 1) + void SetBlockDim() { + // init + int block_num = detail::GetBlockDim(reduce_num); + should_reduce_again = false; + + dim3 block_dim(block_num, 1); + dim3 grid_dim(left_num, 1); + blocking_size = reduce_num; + + if (reduce_type == ReduceType::kReduceHigherDim) { + int last_dim_num = x_dim.back(); + // update left_num + int grid_z = left_num / last_dim_num; + left_num = last_dim_num; + + block_dim.z = 1; + grid_dim.z = grid_z; + + int device_id = platform::GetCurrentDeviceId(); + int max_mp = platform::GetCUDAMultiProcessors(device_id); + int max_threads_per_mp = + platform::GetCUDAMaxThreadsPerMultiProcessor(device_id); + int max_threads = max_threads_per_mp * max_mp; + + // init + int num_block = (max_threads / left_num); + + if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) { + blocking_size = detail::GetLastPow2(reduce_num / num_block); + + if (blocking_size <= 1) { + blocking_size = detail::GetLastPow2(sqrt(reduce_num)); + } else if (blocking_size * 2 < reduce_num) { + blocking_size *= 2; + } + + should_reduce_again = true; + + block_dim.x = 32; + block_dim.y = 1; + grid_dim.x = (left_num + block_dim.x - 1) / block_dim.x; + grid_dim.y = (reduce_num + blocking_size - 1) / blocking_size; + + } else { + block_dim.x = 32; + block_dim.y = 1; + blocking_size = reduce_num; + grid_dim.x = (left_num + block_dim.x - 1) / block_dim.x; + grid_dim.y = 1; + } + } else if (reduce_type == ReduceType::kReduceAny) { + SetBlockDimForReduceAny(&block_dim, &grid_dim); + } + + block = block_dim; + grid = grid_dim; + } + + public: + std::vector reduce_dims_origin; + std::vector reduce_dim; + std::vector x_dim; + std::vector left_dim; + std::vector x_strides; + std::vector left_strides; + std::vector reduce_strides; + + int reduce_type; + int reduce_num; + int left_num; + int blocking_size; + bool should_reduce_again; + bool reduce_lastdim; + + Ty* output_data; + + dim3 block; + dim3 grid; +}; + +static __device__ int SharedMemoryIndex(int index) { + return (threadIdx.y + index) * blockDim.x + threadIdx.x; +} + +template +static __device__ T WarpReduce(T val, ReduceOp reducer) { + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + for (int stride = detail::kWarpSize / 2; stride > 0; stride >>= 1) { + T temp = paddle::platform::CudaShuffleDownSync(mask, val, stride); + val = reducer(val, temp); + } + return val; +} + +/* e.g. + * |---------block---------| + * |warp0|warp1|warp2|warp3| + * |0~31|32~63|64~95|96~127| ---->blockDim.x = 128 + * \|/ \|/ \|/ \|/ ---->1. First WarpReduce in each warp + * res0 res1 res2 res3 ---->2. Store result of each warp to shared memory + * \ \ / / ---->3. 
Load the result above from shared memory + * res to warp0 and process the second WarpReduce + */ +template +static __device__ T BlockXReduce(T val, ReduceOp reducer) { + using detail::kWarpSize; + __shared__ T shared[kWarpSize]; + int block_dim_x = blockDim.x; + if (blockDim.x > kWarpSize) { + block_dim_x = blockDim.x / kWarpSize; + int lane = threadIdx.x % kWarpSize; + int wid = threadIdx.x / kWarpSize; + val = WarpReduce(val, reducer); + if (lane == 0) { + shared[wid] = val; + } + __syncthreads(); + val = shared[lane]; + } + + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + for (int stride = 1; stride < block_dim_x; stride <<= 1) { + T temp = paddle::platform::CudaShuffleDownSync(mask, val, stride); + val = reducer(val, temp); + } + return val; +} + +template +static __device__ T BlockYReduce(T val, ReduceOp reducer) { + __shared__ T shared_memory[detail::kMaxThread]; + shared_memory[SharedMemoryIndex(0)] = val; + for (int stride = blockDim.y / 2; stride > 0; stride >>= 1) { + __syncthreads(); + if (threadIdx.y < stride && threadIdx.y + stride < blockDim.y) { + T temp = shared_memory[SharedMemoryIndex(stride)]; + val = reducer(val, temp); + } + shared_memory[SharedMemoryIndex(0)] = val; + } + return val; +} + +// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, this +// function will be used +// blockId.x -> left_num, threadId.x -> reduce_num +template +__device__ void ReduceLastDim(const Tx* x, Ty* y, ReduceOp reducer, + TransformOp transformer, Ty init, + int reduce_num) { + int idx_x = blockIdx.x * reduce_num; + int idx_y = threadIdx.x; + Ty reduce_var = init; + for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += blockDim.x) { + reduce_var = + reducer(reduce_var, static_cast(transformer(x[idx_x + idx_y]))); + } + __syncthreads(); + + reduce_var = BlockXReduce(reduce_var, reducer); + + if (threadIdx.x == 0) { + y[blockIdx.x] = reduce_var; + } +} + +// when reduce_dim.size() == 1 and reduce_dim[0] != x_dim.size() - 1, this +// function will be used +// eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1 +// if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx / 32 +// else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32 +template +__device__ void ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer, + TransformOp transformer, Ty init, + int reduce_num, int left_num, int block_size) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + int idy = blockIdx.y * block_size; + + Ty reduce_var = init; + + if (idx < left_num) { + int loop = reduce_num - idy; + loop = loop > block_size ? 
block_size : loop; + + for (int iy = 0; iy < loop; iy++) { + int id = (idy + iy) * left_num + idx + blockIdx.z * reduce_num * left_num; + reduce_var = reducer(reduce_var, static_cast(transformer(x[id]))); + } + + y[idx + blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num] = + reduce_var; + } +} + +// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this +// function will be used +template +__device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, + TransformOp transformer, Ty init, int reduce_num, + int left_num, bool reduce_lastdim, + const IndexCalculator& reduce_index_calculator, + const IndexCalculator& left_index_calculator) { + int input_idx, left_idx, stride; + // the last dim gets involved in reduction + if (reduce_lastdim) { + input_idx = blockIdx.y * blockDim.x + threadIdx.x; + left_idx = blockIdx.x; + stride = gridDim.y * blockDim.x; + } else { + input_idx = blockIdx.y * blockDim.y + threadIdx.y; + left_idx = blockIdx.x * blockDim.x + threadIdx.x; + stride = gridDim.y * blockDim.y; + } + // calculate the offset, means the addr where each thread really start. + int input_offset = left_index_calculator.Get(left_idx); + const Tx* input = x + input_offset; + Ty reduce_var = init; + + // 1. reduce for each thread + if (left_idx < left_num) { + // load REDUCE_VEC_SIZE data once, and then compute + Tx input_reg[REDUCE_VEC_SIZE]; + int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; + while (input_idx < bound) { +#pragma unroll + for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { + int reduce_idx = input_idx + i * stride; + int idx_x = reduce_index_calculator.Get(reduce_idx); + input_reg[i] = input[idx_x]; + } +#pragma unroll + for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { + reduce_var = reducer(reduce_var, transformer(input_reg[i])); + } + input_idx += REDUCE_VEC_SIZE * stride; + } + + // deal with the remain part + int input_idx_tmp = input_idx; +#pragma unroll + for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { + if (input_idx >= reduce_num) { + break; + } + int reduce_idx = input_idx; + int idx_x = reduce_index_calculator.Get(reduce_idx); + input_reg[i] = input[idx_x]; + input_idx += stride; + } + input_idx = input_idx_tmp; +#pragma unroll + for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { + if (input_idx >= reduce_num) { + break; + } + reduce_var = reducer(reduce_var, transformer(input_reg[i])); + input_idx += stride; + } + } + + // 2. reduce in block y + if (blockDim.y > 1) { + reduce_var = BlockYReduce(reduce_var, reducer); + } + __syncthreads(); + + if (reduce_lastdim) { + // 3. 
reduce in block x + reduce_var = BlockXReduce(reduce_var, reducer); + if (threadIdx.x == 0) { + y[blockIdx.x + blockIdx.y * gridDim.x] = reduce_var; + } + } else { + if (left_idx < left_num && threadIdx.y == 0) { + y[blockIdx.y * left_num + left_idx] = reduce_var; + } + } +} + +// module function designed for global function +template +__device__ void ReduceModule(const Tx* x, Ty* y, ReduceOp reducer, + TransformOp transformer, Ty init, int reduce_num, + int left_num, int blocking_size, int reduce_type, + bool reduce_lastdim, + const IndexCalculator& reduce_index_calculator, + const IndexCalculator& left_index_calculator) { + if (reduce_type == ReduceType::kReduceLastDim) { + ReduceLastDim(x, y, reducer, transformer, + init, reduce_num); + + // reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1 + } else if (reduce_type == ReduceType::kReduceHigherDim) { + ReduceHigherDim( + x, y, reducer, transformer, init, reduce_num, left_num, blocking_size); + + // reduce_rank >= 2 + } else { + ReduceAny( + x, y, reducer, transformer, init, reduce_num, left_num, reduce_lastdim, + reduce_index_calculator, left_index_calculator); + } +} + +template +__global__ void ReduceKernelFunction(const Tx* x, Ty* y, ReduceOp reducer, + TransformOp transformer, Ty init, + int reduce_num, int left_num, + int blocking_size, int reduce_type, + bool reduce_lastdim, + IndexCalculator reduce_index_calculator, + IndexCalculator left_index_calculator) { + ReduceModule( + x, y, reducer, transformer, init, reduce_num, left_num, blocking_size, + reduce_type, reduce_lastdim, reduce_index_calculator, + left_index_calculator); +} + +template +static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, + const ReduceOp& reducer, Ty init, + gpuStream_t stream, ReduceConfig config) { + using TransformOp = typename ReduceOp::Transformer; + + int reduce_rank = config.reduce_strides.size(); + int left_rank = config.left_strides.size(); + auto reduce_index_calculator = IndexCalculator( + reduce_rank, config.reduce_dim, config.reduce_strides, config.x_strides); + auto left_index_calculator = IndexCalculator( + left_rank, config.left_dim, config.left_strides, config.x_strides); + + ReduceKernelFunction<<>>( + x_data, config.output_data, reducer, TransformOp(config.reduce_num), init, + config.reduce_num, config.left_num, config.blocking_size, + config.reduce_type, config.reduce_lastdim, reduce_index_calculator, + left_index_calculator); + + if (config.should_reduce_again) { + dim3 block; + dim3 grid; + if (config.reduce_lastdim) { + block = dim3(32, 1, 1); + grid = dim3(detail::AlignUp(config.left_num, 32), 1, 1); + } else { + block = dim3(config.block.x, 1, 1); + grid = dim3(config.grid.x, 1, config.grid.z); + } + + ReduceKernelFunction><<>>( + config.output_data, y_data, reducer, + detail::IdentityFunctor(config.grid.y), init, config.grid.y, + config.left_num, config.grid.y, ReduceType::kReduceHigherDim, + config.reduce_lastdim, reduce_index_calculator, left_index_calculator); + } +} + +template class ReduceOp> +void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, + gpuStream_t stream) { + auto x_dim = framework::vectorize(x.dims()); + auto config = ReduceConfig(origin_reduce_dims, x_dim); + config.Run(); // get the parameters of LaunchReduceKernel + + // after config.run() + // SetOutputData for ReduceHigherDim when should_reduce_again is true, + // temp_output should be stored temp_data in output_data space or stored in + // y_data; + framework::Tensor tmp; + auto x_data = 
x.data(); + auto y_data = y->mutable_data(x.place()); + + if (config.reduce_num == 1) { + auto out_dims = y->dims(); + framework::TensorCopy(x, y->place(), y); + y->Resize(out_dims); + return; + } + + config.SetOutputData(y_data, x.place(), &tmp); + + using TransformOp = typename ReduceOp::Transformer; + auto reducer = ReduceOp(); + // launch CUB::Reduce + if (config.reduce_type == static_cast(ReduceType::kReduceAll)) { + cub::TransformInputIterator trans_x( + x_data, TransformOp(config.reduce_num)); + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, + config.reduce_num, reducer, reducer.initial(), + stream); + framework::Tensor tmp; + auto* temp_storage = tmp.mutable_data( + framework::make_ddim({static_cast(temp_storage_bytes)}), + x.place()); + cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, + config.reduce_num, reducer, reducer.initial(), + stream); + + return; + } + + LaunchReduceKernel>( + x_data, y_data, reducer, reducer.initial(), stream, config); +} + +template class ReduceOp> +struct TensorReduceFunc { + const framework::Tensor& x; + framework::Tensor* y; + std::vector origin_reduce_dims; + gpuStream_t stream; + TensorReduceFunc(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, gpuStream_t stream) + : x(x), y(y), origin_reduce_dims(origin_reduce_dims), stream(stream) {} + + template + void apply() const { + TensorReduceFunctorImpl(x, y, origin_reduce_dims, stream); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 913d941df8810bc2906f305b6239444d1280a4ae..368fedececf53336edc7b67f932408d74994d760 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -23,6 +23,9 @@ limitations under the License. */ #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" +#if defined(__HIPCC__) || defined(__NVCC__) +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#endif namespace paddle { namespace operators { @@ -60,6 +63,27 @@ inline void GetShuffledDim(const DDim& src_dims, DDim* dst_dims, } } +static inline std::vector GetReduceDim(const std::vector& dims, + int dim_size, bool reduce_all) { + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(dim_size); + int reduce_size = reduce_dims.size(); + for (int i = 0; i < reduce_size; ++i) { + reduce_dims[i] = i; + } + } else { + for (auto e : dims) { + PADDLE_ENFORCE_LT(e, dim_size, + paddle::platform::errors::InvalidArgument( + "ReduceOp: invalid axis, when x_dims is %d, " + "axis[i] should less than x_dims, but got %d.", + dim_size, e)); + reduce_dims.push_back(e >= 0 ? 
e : e + dim_size); + } + } + return reduce_dims; +} template void GetShuffledInput(const framework::ExecutionContext& context, const Tensor* input, Tensor* shuffled_input, @@ -308,6 +332,7 @@ class BoolReduceKernel : public framework::OpKernel { } } }; + template class ReduceGradKernel : public framework::OpKernel { @@ -559,8 +584,11 @@ class ReduceGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto input_data_type = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); + int in_dtype = ctx.Attr("in_dtype"); + auto input_data_type = + (in_dtype >= 0) ? static_cast(in_dtype) + : OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); #ifdef PADDLE_WITH_MKLDNN auto CanMKLDNNReduceGradBeUsed = [&]() { @@ -568,18 +596,6 @@ class ReduceGradOp : public framework::OperatorWithKernel { if (dx_dims.size() > 5) return false; // max 5D tensor is supported - if (ctx.Attr("reduce_all") || - ((int)ctx.Attr>("dim").size() == dx_dims.size())) - return true; - - auto dy_dims = ctx.Input(framework::GradVarName("Out"))->dims(); - - // Subtensor must be on rightmost part of the bigger tensor - for (int i = 0; i < dy_dims.size(); ++i) { - if (dx_dims[dx_dims.size() - dy_dims.size() + i] != dy_dims[i]) { - return false; - } - } return true; }; if (this->CanMKLDNNBeUsed(ctx, input_data_type) && @@ -590,12 +606,6 @@ class ReduceGradOp : public framework::OperatorWithKernel { } #endif - int in_dtype = ctx.Attr("in_dtype"); - if (in_dtype >= 0) { - return framework::OpKernelType( - static_cast(in_dtype), - ctx.GetPlace()); - } return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -651,6 +661,33 @@ If reduce_all is true, just reduce along all dimensions and output a scalar. virtual std::string GetOpType() const = 0; }; +#if defined(__HIPCC__) || defined(__NVCC__) +template class ReduceOp> +class ReduceCudaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + const Tensor* input = context.Input("X"); + Tensor* output = context.Output("Out"); + auto out_dtype = context.Attr("out_dtype"); + std::vector dims = context.Attr>("dim"); + + std::vector reduce_dims = + GetReduceDim(dims, input->dims().size(), reduce_all); + + gpuStream_t stream = context.cuda_device_context().stream(); + if (out_dtype >= 0) { + framework::VisitDataTypeSmall( + static_cast(out_dtype), + TensorReduceFunc(*input, output, reduce_dims, stream)); + } else { + TensorReduceFunctorImpl(*input, output, reduce_dims, + stream); + } + } +}; +#endif + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu index 44e76c78b1f3e337c59cfbc50f4393d91f22d3df..317a6e1d93c2e8981bd7a54b6e4d64ccd53b9928 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cu @@ -12,26 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
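// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the patch itself: before ReduceCudaKernel
// dispatches to TensorReduceFunctorImpl, the GetReduceDim helper added to
// reduce_op.h above normalizes the "dim" attribute -- reduce_all expands to
// every axis and negative axes wrap around, Python style. A minimal
// standalone version (NormalizeReduceAxes is a hypothetical name, and the
// PADDLE_ENFORCE range check is omitted) looks like this:
#include <vector>

std::vector<int> NormalizeReduceAxes(const std::vector<int>& dims,
                                     int dim_size, bool reduce_all) {
  std::vector<int> axes;
  if (reduce_all) {
    for (int i = 0; i < dim_size; ++i) axes.push_back(i);  // reduce every axis
  } else {
    for (int e : dims) axes.push_back(e >= 0 ? e : e + dim_size);
  }
  return axes;
}
// e.g. NormalizeReduceAxes({-1}, 4, false) -> {3}
//      NormalizeReduceAxes({}, 3, true)    -> {0, 1, 2}
// ---------------------------------------------------------------------------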
+#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" -#ifdef __HIPCC__ -// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922 -// do not support double in HIPCC platform (Eigen3 to be fixed) -REGISTER_OP_CUDA_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); -#else -REGISTER_OP_CUDA_KERNEL(reduce_prod, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); -#endif +REGISTER_OP_CUDA_KERNEL( + reduce_prod, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index 5a8e8894e1c5da8e0d34f15f2e402b7ecbbea364..9e4cc8e213c61e8d2dd4e6f07dab92cf217ce688 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -109,17 +109,21 @@ REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, ops::ReduceSumGradNoNeedBufferVarInferer); REGISTER_OP_CPU_KERNEL( - reduce_sum, ops::ReduceKernel, + ops::ReduceKernel, ops::ReduceKernel, + ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel, ops::ReduceKernel, + paddle::platform::complex, ops::SumFunctor>, ops::ReduceKernel, ops::SumFunctor>); @@ -128,9 +132,10 @@ using CPUReduceSumGradKernel = ops::ReduceSumGradKernel; -REGISTER_OP_CPU_KERNEL(reduce_sum_grad, CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel, - CPUReduceSumGradKernel); +REGISTER_OP_CPU_KERNEL( + reduce_sum_grad, CPUReduceSumGradKernel, + CPUReduceSumGradKernel, CPUReduceSumGradKernel, + CPUReduceSumGradKernel, + CPUReduceSumGradKernel, CPUReduceSumGradKernel, + CPUReduceSumGradKernel>, + CPUReduceSumGradKernel>); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu index 219cc231a1ea7a0786026d6dcc6d63ce78e24025..efbafe4aa8c3e0f538b972c5f1b2f8f83e11d4a6 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu @@ -18,11 +18,14 @@ namespace paddle { namespace operators { -template +template struct IdentityFunctor { HOSTDEVICE explicit inline IdentityFunctor() {} - HOSTDEVICE inline T operator()(const T& x) const { return x; } + template + HOSTDEVICE inline Tout operator()(const U& x) const { + return static_cast(x); + } }; template @@ -56,9 +59,9 @@ class ReduceSumKernel : public framework::OpKernel { if (out_dtype >= 0) { framework::VisitDataTypeSmall( static_cast(out_dtype), - TensorReduceFunctor>( + TensorReduceFunctor( *input, output, reduce_dims, static_cast(0.0), cub::Sum(), - IdentityFunctor(), stream)); + stream)); } else { TensorReduce>( *input, output, reduce_dims, static_cast(0), cub::Sum(), @@ -70,8 +73,10 @@ class ReduceSumKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, - ops::ReduceSumKernel, ops::ReduceSumKernel, - ops::ReduceSumKernel, - ops::ReduceSumKernel, - ops::ReduceSumKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_sum, ops::ReduceSumKernel, ops::ReduceSumKernel, + ops::ReduceSumKernel, + ops::ReduceSumKernel, ops::ReduceSumKernel, + ops::ReduceSumKernel, + ops::ReduceSumKernel>, + ops::ReduceSumKernel>); diff --git 
a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu index f2bee6dddc39ec965966e4964c954e5fb1441bf5..419b8ce276526ba225782660b6c096284ae1d416 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu @@ -20,9 +20,10 @@ using CUDAReduceSumGradKernel = ops::ReduceGradKernel; -REGISTER_OP_CUDA_KERNEL(reduce_sum_grad, CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel, - CUDAReduceSumGradKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_sum_grad, CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel>, + CUDAReduceSumGradKernel>); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc index f3b6e69a48bcb05563bc141e59863f95d6c17e30..78bd42ff00c83f409d1ec3d094ab8a03a2a68eb2 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc @@ -51,7 +51,7 @@ class ReduceSumNPUKernel : public framework::OpKernel { cast_x.Resize(x->dims()); cast_x.mutable_data(ctx.GetPlace()); auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::FP32); - auto runner_cast = NpuOpRunner( + const auto& runner_cast = NpuOpRunner( "Cast", {*x}, {cast_x}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast.Run(stream); @@ -68,20 +68,22 @@ class ReduceSumNPUKernel : public framework::OpKernel { dim_vec.push_back(i); } - auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, - {{"axes", dim_vec}, {"keep_dims", keep_dims}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, + {{"axes", dim_vec}, {"keep_dims", keep_dims}}); runner.Run(stream); } else { - auto runner = NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, - {{"axes", dims}, {"keep_dims", keep_dims}}); + const auto& runner = + NpuOpRunner("ReduceSumD", {cast_x}, {cast_out}, + {{"axes", dims}, {"keep_dims", keep_dims}}); runner.Run(stream); } if (x->type() != framework::proto::VarType::FP32 && x->type() != framework::proto::VarType::FP16) { auto dst_dtype = ConvertToNpuDtype(out->type()); - auto runner_cast = + const auto& runner_cast = NpuOpRunner("Cast", {cast_out}, {*out}, {{"dst_type", static_cast(dst_dtype)}}); runner_cast.Run(stream); @@ -107,8 +109,9 @@ class ReduceSumGradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); if (keep_dims || reduce_all) { - auto runner = NpuOpRunner("BroadcastToD", {*out_grad}, {*x_grad}, - {{"shape", framework::vectorize(x->dims())}}); + const auto& runner = + NpuOpRunner("BroadcastToD", {*out_grad}, {*x_grad}, + {{"shape", framework::vectorize(x->dims())}}); runner.Run(stream); } else { framework::DDim out_dims; @@ -124,8 +127,9 @@ class ReduceSumGradNPUKernel : public framework::OpKernel { &out_grad_tmp); out_grad_tmp.Resize(out_dims); - auto runner = NpuOpRunner("BroadcastToD", {out_grad_tmp}, {*x_grad}, - {{"shape", framework::vectorize(x->dims())}}); + const auto& runner = + NpuOpRunner("BroadcastToD", {out_grad_tmp}, {*x_grad}, + {{"shape", framework::vectorize(x->dims())}}); runner.Run(stream); } } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index e119a21caa23cb937894031a3abec7c33b843615..717029cb8f11733ff03c54949554b91ed1ffe09c 100644 --- 
a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -613,23 +613,24 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR( reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int8_t, ops::ReshapeKernel, uint8_t, ops::ReshapeKernel, int, ops::ReshapeKernel, int64_t, ops::ReshapeKernel, bool, ops::ReshapeKernel, - paddle::platform::bfloat16, ops::ReshapeKernel, paddle::platform::complex64, - ops::ReshapeKernel, paddle::platform::complex128, ops::ReshapeKernel); + paddle::platform::bfloat16, ops::ReshapeKernel, + paddle::platform::complex, ops::ReshapeKernel, + paddle::platform::complex, ops::ReshapeKernel); REGISTER_OP_CPU_KERNEL_FUNCTOR( reshape2_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, paddle::platform::bfloat16, ops::ReshapeGradKernel, - paddle::platform::complex64, ops::ReshapeGradKernel, - paddle::platform::complex128, ops::ReshapeGradKernel); + paddle::platform::complex, ops::ReshapeGradKernel, + paddle::platform::complex, ops::ReshapeGradKernel); REGISTER_OP_CPU_KERNEL_FUNCTOR( reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double, ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t, ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel, bool, ops::ReshapeDoubleGradKernel, paddle::platform::bfloat16, - ops::ReshapeDoubleGradKernel, paddle::platform::complex64, - ops::ReshapeDoubleGradKernel, paddle::platform::complex128, + ops::ReshapeDoubleGradKernel, paddle::platform::complex, + ops::ReshapeDoubleGradKernel, paddle::platform::complex, ops::ReshapeDoubleGradKernel); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -650,22 +651,23 @@ REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, uint8_t, ops::ReshapeKernel, int64_t, ops::ReshapeKernel, plat::float16, ops::ReshapeKernel, bool, ops::ReshapeKernel, - plat::complex64, ops::ReshapeKernel, - plat::complex128, ops::ReshapeKernel); + plat::complex, ops::ReshapeKernel, + plat::complex, ops::ReshapeKernel); REGISTER_OP_CUDA_KERNEL_FUNCTOR( reshape2_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, uint8_t, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16, - ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, plat::complex64, - ops::ReshapeGradKernel, plat::complex128, ops::ReshapeGradKernel); + ops::ReshapeGradKernel, bool, ops::ReshapeGradKernel, plat::complex, + ops::ReshapeGradKernel, plat::complex, ops::ReshapeGradKernel); REGISTER_OP_CUDA_KERNEL_FUNCTOR( reshape2_grad_grad, float, ops::ReshapeDoubleGradKernel, double, ops::ReshapeDoubleGradKernel, int, ops::ReshapeDoubleGradKernel, uint8_t, ops::ReshapeDoubleGradKernel, int64_t, ops::ReshapeDoubleGradKernel, plat::float16, ops::ReshapeDoubleGradKernel, bool, - ops::ReshapeDoubleGradKernel, plat::complex64, ops::ReshapeDoubleGradKernel, - plat::complex128, ops::ReshapeDoubleGradKernel); + ops::ReshapeDoubleGradKernel, plat::complex, + ops::ReshapeDoubleGradKernel, plat::complex, + ops::ReshapeDoubleGradKernel); #endif #ifdef PADDLE_WITH_XPU @@ -673,14 +675,14 @@ REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, int64_t, ops::ReshapeKernel, plat::float16, ops::ReshapeKernel, bool, ops::ReshapeKernel, - plat::complex64, ops::ReshapeKernel, - plat::complex128, ops::ReshapeKernel); + 
plat::complex, ops::ReshapeKernel, + plat::complex, ops::ReshapeKernel); REGISTER_OP_XPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel, plat::float16, ops::ReshapeGradKernel, bool, - ops::ReshapeGradKernel, plat::complex64, - ops::ReshapeGradKernel, plat::complex128, + ops::ReshapeGradKernel, plat::complex, + ops::ReshapeGradKernel, plat::complex, ops::ReshapeGradKernel); #endif diff --git a/paddle/fluid/operators/reverse_op.cc b/paddle/fluid/operators/reverse_op.cc index 8b2b9f464b407ba27333e354854a70a233986853..98a1610be607e8bcd6d14a25a45d1856a64dbe8a 100644 --- a/paddle/fluid/operators/reverse_op.cc +++ b/paddle/fluid/operators/reverse_op.cc @@ -145,4 +145,12 @@ REGISTER_OP_CPU_KERNEL( ops::ReverseKernel, ops::ReverseKernel, ops::ReverseKernel, - ops::ReverseKernel) + ops::ReverseKernel); + +REGISTER_OP_CUDA_KERNEL( + reverse, ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel, + ops::ReverseKernel); diff --git a/paddle/fluid/operators/reverse_op.h b/paddle/fluid/operators/reverse_op.h index 2813f7a4864a9ee84cefd8c824ee6f277b192dec..bf91e2f57a6676da7fca0a89564e59d99dd72981 100644 --- a/paddle/fluid/operators/reverse_op.h +++ b/paddle/fluid/operators/reverse_op.h @@ -16,6 +16,7 @@ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -23,7 +24,7 @@ template struct ReverseFunctor { void operator()(const DeviceContext& context, const framework::LoDTensor& in, framework::LoDTensor* out, const std::vector& axis) { - Eigen::array reverse_axis; + Eigen::DSizes reverse_axis; for (int i = 0; i < Rank; ++i) { reverse_axis[i] = false; } @@ -37,9 +38,10 @@ struct ReverseFunctor { auto in_eigen = framework::EigenTensor::From(in); auto out_eigen = framework::EigenTensor::From(*out); - auto* dev = context.eigen_device(); + auto& dev = *context.eigen_device(); - out_eigen.device(*dev) = in_eigen.reverse(reverse_axis); + EigenReverse, T, Rank>::Eval( + dev, out_eigen, in_eigen, reverse_axis); } }; diff --git a/paddle/fluid/operators/rnn_op.cu.cc b/paddle/fluid/operators/rnn_op.cu.cc index 2be59c620441d6b3674b02373acc44e54751a50e..07329a9175e525cfb023737b340400e0400b5ff9 100644 --- a/paddle/fluid/operators/rnn_op.cu.cc +++ b/paddle/fluid/operators/rnn_op.cu.cc @@ -29,15 +29,21 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; +#ifdef PADDLE_WITH_HIP +using gpuRNNMode_t = miopenRNNMode_t; +using gpuDnnHandle_t = miopenHandle_t; +using gpuDnnDataType_t = miopenDataType_t; +#else +using gpuRNNMode_t = cudnnRNNMode_t; +using gpuDnnHandle_t = cudnnHandle_t; +using gpuDnnDataType_t = cudnnDataType_t; +#endif + class RNNDescriptors { public: RNNDescriptors(int seq_length, int batch_size, int input_size, int hidden_size, int num_layers, float dropout_prob, int seed, -#ifdef PADDLE_WITH_HIP - int weight_numel, miopenRNNMode_t mode, bool is_bidirec, -#else - int weight_numel, cudnnRNNMode_t mode, bool is_bidirec, -#endif + int weight_numel, gpuRNNMode_t mode, bool is_bidirec, bool is_test) : seq_length_(seq_length), batch_size_(batch_size), @@ -49,23 +55,14 @@ class RNNDescriptors { weight_numel_(weight_numel), mode_(mode), is_bidirec_(is_bidirec), - is_test_(is_test) { - } + is_test_(is_test) {} template -#ifdef PADDLE_WITH_HIP - void Create(const 
miopenHandle_t &handle, const platform::Place &place, -#else - void Create(const cudnnHandle_t &handle, const platform::Place &place, -#endif + void Create(const gpuDnnHandle_t &handle, const platform::Place &place, const std::vector &sequence_length, size_t *workspace_size, size_t *reserve_size, framework::Tensor *dropout_state) { int numDirections = is_bidirec_ ? 2 : 1; -#ifdef PADDLE_WITH_HIP - miopenDataType_t cudnn_type = platform::CudnnDataType::type; -#else - cudnnDataType_t cudnn_type = platform::CudnnDataType::type; -#endif + gpuDnnDataType_t cudnn_type = platform::CudnnDataType::type; // ------------------- cudnn x, y descriptors --------------------- std::vector dims_x = {batch_size_, input_size_, 1}; std::vector strides_x = {input_size_, 1, 1}; @@ -215,11 +212,7 @@ class RNNDescriptors { float dropout_prob_; int seed_; int weight_numel_; -#ifdef PADDLE_WITH_HIP - miopenRNNMode_t mode_; -#else - cudnnRNNMode_t mode_; -#endif + gpuRNNMode_t mode_; bool is_bidirec_; bool is_test_; #ifdef PADDLE_WITH_HIP @@ -296,6 +289,105 @@ void weight_to_tensor_list(const platform::Place &place, gpuStream_t stream, } } +#ifdef PADDLE_WITH_HIP +template +void weight_list_to_tensor(const platform::Place &place, gpuStream_t stream, + const std::vector &tensor_list, + Tensor *weight_whole, const size_t offset = 0UL) { + size_t weight_offset = offset; + auto weight_data = weight_whole->data(); + + for (size_t i = 0; i < tensor_list.size(); ++i) { + const T *in_data = tensor_list[i].data(); + auto in_size = tensor_list[i].numel(); + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, weight_whole->place()), + weight_data + weight_offset, + BOOST_GET_CONST(platform::CUDAPlace, tensor_list[i].place()), + in_data, in_size * sizeof(T), stream); + weight_offset += in_size; + } +} + +template +void weight_to_permuted_tensor(const platform::Place &place, gpuStream_t stream, + std::vector *weight_list, + Tensor *weight_whole, + const gpuRNNMode_t rnn_mode, + const bool is_bidirec) { + if (is_bidirec) { + for (size_t i = 0; i < weight_list->size(); i += 4) { + auto tmp = (*weight_list)[i + 1]; + (*weight_list)[i + 1] = (*weight_list)[i + 2]; + (*weight_list)[i + 2] = tmp; + } + } + size_t weight_offset = 0; + for (size_t i = 0; i < weight_list->size(); ++i) { + if (rnn_mode == miopenLSTM) { + std::vector split_tensor = (*weight_list)[i]->Chunk(4, 0); + weight_list_to_tensor( + place, stream, + {split_tensor[0], split_tensor[1], split_tensor[3], split_tensor[2]}, + weight_whole, weight_offset); + } else if (rnn_mode == miopenGRU) { + std::vector split_tensor = (*weight_list)[i]->Chunk(3, 0); + weight_list_to_tensor( + place, stream, {split_tensor[1], split_tensor[0], split_tensor[2]}, + weight_whole, weight_offset); + } else { + weight_list_to_tensor(place, stream, {*(*weight_list)[i]}, + weight_whole, weight_offset); + } + weight_offset += (*weight_list)[i]->numel(); + } +} + +template +void tensor_to_permuted_weight(const platform::Place &place, gpuStream_t stream, + const Tensor &tensor, + std::vector *weight_grad_list, + const gpuRNNMode_t rnn_mode, + const bool is_bidirec) { + if (is_bidirec) { + for (size_t i = 0; i < weight_grad_list->size(); i += 4) { + auto tmp = (*weight_grad_list)[i + 1]; + (*weight_grad_list)[i + 1] = (*weight_grad_list)[i + 2]; + (*weight_grad_list)[i + 2] = tmp; + } + } + size_t weight_offset = 0; + for (size_t i = 0; i < weight_grad_list->size(); ++i) { + auto numel_size = (*weight_grad_list)[i]->numel(); + Tensor temp; + temp.mutable_data({numel_size}, place); + 
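// temp is a flat view over this layer's slice of the permuted weight-grad
+ // tensor; the Chunk() calls below copy its per-gate pieces back into the
+ // framework's gate order ({0, 1, 3, 2} for miopenLSTM, {1, 0, 2} for miopenGRU).
+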
temp.ShareDataWith(tensor.Slice(weight_offset, weight_offset + numel_size)); + + if (rnn_mode == miopenLSTM) { + std::vector split_tensor = temp.Chunk(4, 0); + weight_list_to_tensor( + place, stream, + {split_tensor[0], split_tensor[1], split_tensor[3], split_tensor[2]}, + (*weight_grad_list)[i]); + } else if (rnn_mode == miopenGRU) { + std::vector split_tensor = temp.Chunk(3, 0); + weight_list_to_tensor( + place, stream, {split_tensor[1], split_tensor[0], split_tensor[2]}, + (*weight_grad_list)[i]); + } else { + weight_list_to_tensor(place, stream, {temp}, (*weight_grad_list)[i]); + } + weight_offset += numel_size; + } + if (is_bidirec) { + for (size_t i = 0; i < weight_grad_list->size(); i += 4) { + auto tmp = (*weight_grad_list)[i + 1]; + (*weight_grad_list)[i + 1] = (*weight_grad_list)[i + 2]; + (*weight_grad_list)[i + 2] = tmp; + } + } +} +#endif + template class RNNCudnnKernel : public framework::OpKernel { public: @@ -314,7 +406,7 @@ class RNNCudnnKernel : public framework::OpKernel { int num_layers = ctx.Attr("num_layers"); auto mode = ctx.Attr("mode"); #ifdef PADDLE_WITH_HIP - miopenRNNMode_t rnn_mode = miopenLSTM; + gpuRNNMode_t rnn_mode = miopenLSTM; if (mode == "LSTM") rnn_mode = miopenLSTM; else if (mode == "GRU") @@ -324,7 +416,7 @@ class RNNCudnnKernel : public framework::OpKernel { else if (mode == "RNN_TANH") rnn_mode = miopenRNNTANH; #else - cudnnRNNMode_t rnn_mode = CUDNN_LSTM; + gpuRNNMode_t rnn_mode = CUDNN_LSTM; if (mode == "LSTM") rnn_mode = CUDNN_LSTM; else if (mode == "GRU") @@ -373,6 +465,11 @@ class RNNCudnnKernel : public framework::OpKernel { } bool has_seq_length = ctx.HasInput("SequenceLength"); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_EQ(has_seq_length, false, + platform::errors::InvalidArgument( + "ROCm do not support SequenceLength yet.")); +#endif std::vector SequenceLength; if (has_seq_length) { auto *sequence_length = ctx.Input("SequenceLength"); @@ -400,14 +497,26 @@ class RNNCudnnKernel : public framework::OpKernel { [](int64_t num, const Tensor *t) { return num + t->numel(); }); bool continuous = is_continuous>(weight_list); +#ifdef PADDLE_WITH_HIP + // Need to permute weight, set continuous to false + continuous = false; +#endif if (!continuous) { LOG_FIRST_N(WARNING, 2) << "If the memory space of the Input WeightList is not continuous, " "less efficient calculation will be called. 
Please call " "flatten_parameters() to make the input memory continuous."; weight_whole.mutable_data({weight_numel}, place); +#ifdef PADDLE_WITH_HIP + // MIOPEN need to permute weight for miopenLSTM or miopenGRU + weight_to_permuted_tensor(place, stream, &weight_list, &weight_whole, + rnn_mode, is_bidirec); +#else weight_to_tensor(place, stream, weight_list, &weight_whole); +#endif w_data = weight_whole.data(); +#ifndef PADDLE_WITH_HIP + // MIOPEN need to permute weight, do not share with weight_grad if (is_test) { // maybe also reset small weights' ptr for training int offset = 0; for (size_t i = 0; i < weight_list.size(); ++i) { @@ -421,6 +530,7 @@ class RNNCudnnKernel : public framework::OpKernel { offset += len; } } +#endif } else { w_data = const_cast(weight_list[0]->data()); } @@ -486,11 +596,7 @@ class RNNCudnnKernel : public framework::OpKernel { } } -#ifdef PADDLE_WITH_HIP - void RNNInferece(const bool &has_seq_length, const miopenHandle_t &handle, -#else - void RNNInferece(const bool &has_seq_length, const cudnnHandle_t &handle, -#endif + void RNNInferece(const bool &has_seq_length, const gpuDnnHandle_t &handle, const int &seq_length, RNNDescriptors *rnn, const T *x_data, const T *init_h_data, const T *init_c_data, const T *w_data, T *out_data, T *last_h_data, T *last_c_data, @@ -607,9 +713,20 @@ class RNNGradCudnnKernel : public framework::OpKernel { Tensor weight_whole; T *weight_data = nullptr; +#ifdef PADDLE_WITH_HIP + // Need to permute weight, set continuous to false + continuous = false; +#endif + if (!continuous) { weight_whole.mutable_data({weight_numel}, place); +#ifdef PADDLE_WITH_HIP + // MIOPEN need to permute weight for miopenLSTM or miopenGRU + weight_to_permuted_tensor(place, stream, &weight_list, &weight_whole, + rnn_mode, is_bidirec); +#else weight_to_tensor(place, stream, weight_list, &weight_whole); +#endif weight_data = weight_whole.data(); } else { weight_data = const_cast(weight_list[0]->data()); @@ -621,6 +738,13 @@ class RNNGradCudnnKernel : public framework::OpKernel { zero(dev_ctx, &weight_grad, static_cast(0.0)); T *weight_grad_data = weight_grad.data(); +#ifdef PADDLE_WITH_HIP + // MIOPEN need to permute weight_grad_list, so do not share data with + // weight_grad + for (size_t i = 0; i < weight_grad_list.size(); ++i) { + weight_grad_list[i]->mutable_data(ctx.GetPlace()); + } +#else int offset = 0; for (size_t i = 0; i < weight_grad_list.size(); ++i) { size_t len = weight_grad_list[i]->numel(); @@ -631,6 +755,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { .Resize(dim); offset += len; } +#endif Tensor input_grad_value; if (!in_grad) { @@ -672,6 +797,11 @@ class RNNGradCudnnKernel : public framework::OpKernel { } bool has_seq_length = ctx.HasInput("SequenceLength"); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_EQ(has_seq_length, false, + platform::errors::InvalidArgument( + "ROCm do not support SequenceLength yet.")); +#endif std::vector SequenceLength; if (has_seq_length) { auto *sequence_length = ctx.Input("SequenceLength"); @@ -731,6 +861,9 @@ class RNNGradCudnnKernel : public framework::OpKernel { rnn.weight_desc(), weight_grad_data, workspace_data_.data(), workspace_size, const_cast(reserve_data), reserve_size)); + // permute weight grad list from weight grad tensor + tensor_to_permuted_weight(place, stream, weight_grad, + &weight_grad_list, rnn_mode, is_bidirec); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), diff --git 
a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index d6ba399439d0238f12797cc2a0ab90389225b7af..934802f6a9e0e9eec1e6492595c336a5ce3bd927 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -124,8 +124,10 @@ __global__ void GPUROIAlignForward( T roi_width = roi_xmax - roi_xmin; T roi_height = roi_ymax - roi_ymin; - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); @@ -138,7 +140,7 @@ __global__ void GPUROIAlignForward( : ceil(roi_height / pooled_height); int roi_bin_grid_w = (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); - const T count = roi_bin_grid_h * roi_bin_grid_w; + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); T output_val = 0; for (int iy = 0; iy < roi_bin_grid_h; iy++) { const T y = roi_ymin + ph * bin_size_h + @@ -180,9 +182,10 @@ __global__ void GPUROIAlignBackward( T roi_width = roi_xmax - roi_xmin; T roi_height = roi_ymax - roi_ymin; - roi_width = max(roi_width, static_cast(1.)); - roi_height = max(roi_height, static_cast(1.)); - + if (!continuous_coordinate) { + roi_width = max(roi_width, static_cast(1.)); + roi_height = max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h index 46564ed4f629d80a2ab1706b512598cf8dbe4a27..29c9268d5241cce8bfaad6a96950933f1b7a3280 100644 --- a/paddle/fluid/operators/roi_align_op.h +++ b/paddle/fluid/operators/roi_align_op.h @@ -226,8 +226,10 @@ class CPUROIAlignOpKernel : public framework::OpKernel { T roi_width = roi_xmax - roi_xmin; T roi_height = roi_ymax - roi_ymin; - roi_width = std::max(roi_width, static_cast(1.)); - roi_height = std::max(roi_height, static_cast(1.)); + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); @@ -239,7 +241,7 @@ class CPUROIAlignOpKernel : public framework::OpKernel { int roi_bin_grid_w = (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); - const T count = roi_bin_grid_h * roi_bin_grid_w; + const T count = std::max(roi_bin_grid_h * roi_bin_grid_w, 1); Tensor pre_pos; Tensor pre_w; int pre_size = count * out_stride[1]; @@ -362,6 +364,10 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel { T roi_height = roi_ymax - roi_ymin; roi_width = std::max(roi_width, static_cast(1.)); roi_height = std::max(roi_height, static_cast(1.)); + if (!aligned) { + roi_width = std::max(roi_width, static_cast(1.)); + roi_height = std::max(roi_height, static_cast(1.)); + } T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc index b1fe95203636fe96b0e45afdbf040402aa7e9718..a0c28ae6cba16defd47f3e332717dfd86808c735 100644 --- a/paddle/fluid/operators/roll_op.cc +++ b/paddle/fluid/operators/roll_op.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/operators/roll_op.h" + #include #include + #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -37,12 +39,22 @@ class RollOp : public framework::OperatorWithKernel { auto dims = ctx->Attrs().Get>("axis"); auto shifts = ctx->Attrs().Get>("shifts"); - PADDLE_ENFORCE_EQ(dims.size(), shifts.size(), - platform::errors::InvalidArgument( - "Attr(dims).size() should be equl to " - "Attr(shifts).size(). But received " - "Attr(dims).size() = %d, Attr(shifts).size() = %d", - dims.size(), shifts.size())); + if (dims.size() != 0) { + PADDLE_ENFORCE_EQ(dims.size(), shifts.size(), + platform::errors::InvalidArgument( + "When dims.size() != 0, dims.size() " + "should be equal to " + "shifts.size(). But received " + "dims.size() = %d, shifts.size() = %d", + dims.size(), shifts.size())); + } else { + PADDLE_ENFORCE_EQ(shifts.size(), 1, + platform::errors::InvalidArgument( + "When dims.size() == 0, shifts.size() " + "should be equal to 1, But received " + "shifts.size() = %d", + shifts.size())); + } ctx->SetOutputDim("Out", ctx->GetInputDim("X")); auto type = ctx->GetInputsVarType("X")[0]; @@ -95,7 +107,7 @@ class RollOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr>( "axis", "Axis along which to roll. It must have the same size " - "with shifts.") + "with shifts or size == 0") .SetDefault({}); AddComment(R"DOC( Roll the tensor along the given dimension(s). @@ -151,8 +163,9 @@ REGISTER_OP_VERSION(roll) paddle::framework::compatible::OpVersionDesc() .NewAttr("axis", "(std::vector) Axis along which to roll. " - "It must have the same size with shifts.", + "It must have the same size with shifts, or size = 0.", std::vector()) - .DeleteAttr("dims", - "(std::vector) Dims along which to roll. " - "It must have the same size with shifts.")); + .DeleteAttr( + "dims", + "(std::vector) Dims along which to roll. " + "It must have the same size with shifts, or size = 0.")); diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu index 09309c492d29225cb2b0ed42559e43e73ea49c7f..34d4d67e39d53442a7a8d177292427a933e518b7 100644 --- a/paddle/fluid/operators/roll_op.cu +++ b/paddle/fluid/operators/roll_op.cu @@ -13,6 +13,7 @@ // limitations under the License. 
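// A minimal host-side sketch of the semantics behind the empty-axis handling
// added to roll_op.cc above and to the CUDA/CPU kernels below (illustration
// only; the function name is ours and not part of this patch): when "axis" is
// omitted, roll treats the input as a flat 1-D buffer, normalizes the single
// shift the same way the kernels do, and moves element i to (i + s) % n.
//
//   // assumes a non-empty input; needs <vector> and <cstdint>
//   std::vector<float> RollFlat(const std::vector<float>& x, int64_t shift) {
//     const int64_t n = static_cast<int64_t>(x.size());
//     std::vector<float> y(x.size());
//     const int64_t s = ((shift % n) + n) % n;  // same normalization as the kernels
//     for (int64_t i = 0; i < n; ++i) y[(i + s) % n] = x[i];
//     return y;
//   }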
#pragma once +#include "paddle/fluid/framework/array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/roll_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -24,26 +25,34 @@ using platform::PADDLE_CUDA_NUM_THREADS; using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template -__global__ void roll_cuda_kernel(const T* input, T* output, int64_t N, - int64_t* shifts, int64_t* strides, - int64_t* sizes, int64_t nums) { +template +__global__ void RollCudaKernel(const T* input, T* output, int64_t N, + paddle::framework::Array shifts, + paddle::framework::Array strides, + paddle::framework::Array sizes) { int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= N) { return; } + int64_t output_idx = idx; - int64_t dim_idx, dim_idx_shift; - for (int64_t i = 0; i < nums; i++) { - dim_idx = idx % (strides[i] * sizes[i]) / strides[i]; - dim_idx_shift = (dim_idx + shifts[i]) % sizes[i]; - output_idx = output_idx + (dim_idx_shift - dim_idx) * strides[i]; + int64_t new_dim_idx = 0; + +#pragma unroll + for (size_t i = 0; i < Rank; i++) { + new_dim_idx = (idx / strides[i]) % sizes[i] + shifts[i]; + if (new_dim_idx >= sizes[i]) { + output_idx += (shifts[i] - sizes[i]) * strides[i]; + } else { + output_idx += shifts[i] * strides[i]; + } } output[output_idx] = input[idx]; } -template -class RollCUDAKernel : public framework::OpKernel { +template +class RollKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); @@ -61,50 +70,62 @@ class RollCUDAKernel : public framework::OpKernel { auto input_dim = in->dims(); auto stride_dim = framework::stride(input_dim); - int64_t dim, size; - size_t gpu_memory_size_ = sizeof(int64_t) * nums; - std::vector strides, sizes; - strides.resize(nums); - sizes.resize(nums); - paddle::memory::AllocationPtr shifts_gpu = - memory::Alloc(context.GetPlace(), gpu_memory_size_); - paddle::memory::AllocationPtr strides_gpu = - memory::Alloc(context.GetPlace(), gpu_memory_size_); - paddle::memory::AllocationPtr sizes_gpu = - memory::Alloc(context.GetPlace(), gpu_memory_size_); - - for (size_t i = 0; i < nums; i++) { - dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size(); - size = input_dim[dim]; - shifts[i] = (shifts[i] % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; + std::vector strides(nums), sizes(nums); + if (dims.size() == 0) { + strides[0] = 1; + sizes[0] = numel; + shifts[0] = (shifts[0] % numel + numel) % numel; + } else { + for (size_t i = 0; i < nums; i++) { + int dim = dims[i] >= 0 ? 
dims[i] : dims[i] + input_dim.size(); + int64_t size = input_dim[dim]; + + shifts[i] = (shifts[i] % size + size) % size; + strides[i] = stride_dim[dim]; + sizes[i] = size; + } + } + +#define CALL_ROLL_CUDA_KERNEL(N) \ + case N: { \ + paddle::framework::Array _strides; \ + paddle::framework::Array _shifts; \ + paddle::framework::Array _sizes; \ + for (size_t idx = 0; idx < N; ++idx) { \ + _strides[idx] = strides[idx]; \ + _shifts[idx] = shifts[idx]; \ + _sizes[idx] = sizes[idx]; \ + } \ + RollCudaKernel< \ + T, \ + N><<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, \ + PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_data, out_data, numel, \ + _shifts, _strides, _sizes); \ + break; \ + } + + switch (nums) { + CALL_ROLL_CUDA_KERNEL(1); + CALL_ROLL_CUDA_KERNEL(2); + CALL_ROLL_CUDA_KERNEL(3); + CALL_ROLL_CUDA_KERNEL(4); + CALL_ROLL_CUDA_KERNEL(5); + CALL_ROLL_CUDA_KERNEL(6); + CALL_ROLL_CUDA_KERNEL(7); + CALL_ROLL_CUDA_KERNEL(8); + CALL_ROLL_CUDA_KERNEL(9); + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "shifts.size() should be less than 10, But received shifts.size() " + "= %d", + shifts.size())); } - paddle::memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, shifts_gpu->place()), - shifts_gpu->ptr(), platform::CPUPlace(), shifts.data(), - gpu_memory_size_, stream); - paddle::memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, strides_gpu->place()), - strides_gpu->ptr(), platform::CPUPlace(), strides.data(), - gpu_memory_size_, stream); - paddle::memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, sizes_gpu->place()), - sizes_gpu->ptr(), platform::CPUPlace(), sizes.data(), gpu_memory_size_, - stream); - int64_t* shifts_ptr = reinterpret_cast(shifts_gpu->ptr()); - int64_t* strides_ptr = reinterpret_cast(strides_gpu->ptr()); - int64_t* sizes_ptr = reinterpret_cast(sizes_gpu->ptr()); - - roll_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, numel, shifts_ptr, strides_ptr, sizes_ptr, nums); } }; -template -class RollGradCUDAKernel : public framework::OpKernel { +template +class RollGradKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input(framework::GradVarName("Out")); @@ -121,46 +142,38 @@ class RollGradCUDAKernel : public framework::OpKernel { auto input_dim = in->dims(); auto stride_dim = framework::stride(input_dim); - int64_t dim, size; - size_t gpu_memory_size_ = sizeof(int64_t) * nums; - std::vector strides, sizes; - strides.resize(nums); - sizes.resize(nums); - paddle::memory::AllocationPtr shifts_gpu = - memory::Alloc(context.GetPlace(), gpu_memory_size_); - paddle::memory::AllocationPtr strides_gpu = - memory::Alloc(context.GetPlace(), gpu_memory_size_); - paddle::memory::AllocationPtr sizes_gpu = - memory::Alloc(context.GetPlace(), gpu_memory_size_); - - for (size_t i = 0; i < nums; i++) { - dim = dims[i] >= 0 ? dims[i] : dims[i] + input_dim.size(); - size = input_dim[dim]; - shifts[i] = ((0 - shifts[i]) % size + size) % size; - strides[i] = stride_dim[dim]; - sizes[i] = size; + std::vector strides(nums), sizes(nums); + if (dims.size() == 0) { + strides[0] = 1; + sizes[0] = numel; + shifts[0] = ((-shifts[0]) % numel + numel) % numel; + } else { + for (size_t i = 0; i < nums; i++) { + int dim = dims[i] >= 0 ? 
dims[i] : dims[i] + input_dim.size(); + int64_t size = input_dim[dim]; + + shifts[i] = ((-shifts[i]) % size + size) % size; + strides[i] = stride_dim[dim]; + sizes[i] = size; + } } - paddle::memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, shifts_gpu->place()), - shifts_gpu->ptr(), platform::CPUPlace(), shifts.data(), - gpu_memory_size_, stream); - paddle::memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, strides_gpu->place()), - strides_gpu->ptr(), platform::CPUPlace(), strides.data(), - gpu_memory_size_, stream); - paddle::memory::Copy( - BOOST_GET_CONST(platform::CUDAPlace, sizes_gpu->place()), - sizes_gpu->ptr(), platform::CPUPlace(), sizes.data(), gpu_memory_size_, - stream); - int64_t* shifts_ptr = reinterpret_cast(shifts_gpu->ptr()); - int64_t* strides_ptr = reinterpret_cast(strides_gpu->ptr()); - int64_t* sizes_ptr = reinterpret_cast(sizes_gpu->ptr()); - - roll_cuda_kernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / - PADDLE_CUDA_NUM_THREADS, - PADDLE_CUDA_NUM_THREADS, 0, stream>>>( - in_data, out_data, numel, shifts_ptr, strides_ptr, sizes_ptr, nums); + switch (nums) { + CALL_ROLL_CUDA_KERNEL(1); + CALL_ROLL_CUDA_KERNEL(2); + CALL_ROLL_CUDA_KERNEL(3); + CALL_ROLL_CUDA_KERNEL(4); + CALL_ROLL_CUDA_KERNEL(5); + CALL_ROLL_CUDA_KERNEL(6); + CALL_ROLL_CUDA_KERNEL(7); + CALL_ROLL_CUDA_KERNEL(8); + CALL_ROLL_CUDA_KERNEL(9); + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "shifts.size() should be less than 10, But received shifts.size() " + "= %d", + shifts.size())); + } } }; @@ -169,13 +182,12 @@ class RollGradCUDAKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - roll, ops::RollCUDAKernel, - ops::RollCUDAKernel, - ops::RollCUDAKernel, - ops::RollCUDAKernel); + roll, ops::RollKernel, + ops::RollKernel, + ops::RollKernel, + ops::RollKernel); REGISTER_OP_CUDA_KERNEL( - roll_grad, - ops::RollGradCUDAKernel, - ops::RollGradCUDAKernel, - ops::RollGradCUDAKernel, - ops::RollGradCUDAKernel); + roll_grad, ops::RollGradKernel, + ops::RollGradKernel, + ops::RollGradKernel, + ops::RollGradKernel); diff --git a/paddle/fluid/operators/roll_op.h b/paddle/fluid/operators/roll_op.h index 74dd37ed8388fe495cf5bf6cc859dd899fdd87dd..da4f335ca7faa62504b6426bce37c63c4e0f17e3 100644 --- a/paddle/fluid/operators/roll_op.h +++ b/paddle/fluid/operators/roll_op.h @@ -88,7 +88,13 @@ class RollKernel : public framework::OpKernel { TensorToVector(input, context.device_context(), &out_vec); size_t nums = shifts.size(); - const DDim input_dim = input.dims(); + DDim input_dim = input.dims(); + + // axis = none, reshape to 1-D tensor + if (dims.size() == 0) { + dims.push_back(0l); + input_dim = framework::Dim<1>(out_vec.size()); + } for (size_t i = 0; i < nums; i++) { PADDLE_ENFORCE_EQ( @@ -101,7 +107,7 @@ class RollKernel : public framework::OpKernel { } output->mutable_data(context.GetPlace()); framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input_dim); + output->Resize(input.dims()); } }; @@ -120,14 +126,20 @@ class RollGradKernel : public framework::OpKernel { TensorToVector(input, context.device_context(), &out_vec); size_t nums = shifts.size(); - const DDim input_dim = input.dims(); + DDim input_dim = input.dims(); + + // axis = none, reshape to 1-D tensor + if (dims.size() == 0) { + dims.push_back(0l); + input_dim = framework::Dim<1>(out_vec.size()); + } for (size_t i = 0; i < nums; i++) { shift_along_dim(out_vec.data(), input_dim, dims[i], 0 - shifts[i]); } output->mutable_data(context.GetPlace()); 
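// Copy the shifted values back into the output tensor; the Resize below
// restores the original shape after the temporary 1-D view used when axis
// is empty.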
framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input_dim); + output->Resize(input.dims()); } }; diff --git a/paddle/fluid/operators/run_program_op.cc b/paddle/fluid/operators/run_program_op.cc index 2d599716443901053aa3d5dc8e93759320175b24..69b2c5b73800738ed740cc59786c42222a1d9e35 100644 --- a/paddle/fluid/operators/run_program_op.cc +++ b/paddle/fluid/operators/run_program_op.cc @@ -83,6 +83,13 @@ class RunProgramOpMaker : public framework::OpProtoAndCheckerMaker { "contains at most one scope." "NOTE: Do not use Scope directly because Scope output is not " "currently supported."); + AddOutput("DOut", + "(vector)" + "The output tensors for GRAD Tensors in RunProgram forward " + "operator, the forward operator contains GRAD Tensors when it " + "computes double grad.") + .AsDuplicable() + .AsDispensable(); AddAttr("global_block", "(BlockDesc *)" "The global block of executed program desc."); @@ -154,6 +161,7 @@ class RunProgramGradOpMaker : public framework::SingleGradOpMaker { grad_op->SetInput("Params", this->Input("Params")); grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); grad_op->SetInput("OutScope", this->Output("OutScope")); + grad_op->SetInput("DOut", this->Output("DOut")); grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); grad_op->SetOutput(framework::GradVarName("Params"), this->InputGrad("Params")); diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index f78f5c5b948c63e02d9121c540b6207c30b2d0f9..c7aeb0e145e4cb704c56dabb2f090e63ecb280a7 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -131,6 +131,9 @@ static void ShareVarsIntoScope(const std::vector &vars, const std::vector &var_names, framework::Scope *scope) { for (size_t i = 0; i < vars.size(); ++i) { + if (var_names[i] == "Fake_var") { + continue; + } auto *var = scope->Var(var_names[i]); CheckInputVarStatus(*vars[i], var_names[i]); VariableShare(*vars[i], var); @@ -141,9 +144,9 @@ static void ShareVarsFromScope(const std::vector &vars, const std::vector &var_names, framework::Scope *scope) { for (size_t i = 0; i < vars.size(); ++i) { - if (var_names[i] == framework::kEmptyVarName) { - VLOG(2) << "find variable name is " << framework::kEmptyVarName - << ", skip it!"; + if (var_names[i] == framework::kEmptyVarName || + var_names[i] == "Fake_var") { + VLOG(2) << "find variable name is " << var_names[i] << ", skip it!"; continue; } // NOTE: Here skip not found var is dangerous, if a bug is caused here, @@ -170,9 +173,11 @@ class RunProgramOpKernel : public framework::OpKernel { auto &input_vars = ctx.MultiInputVar("X"); auto ¶m_vars = ctx.MultiInputVar("Params"); auto output_vars = ctx.MultiOutputVar("Out"); + auto dout_vars = ctx.MultiOutputVar("DOut"); auto input_var_names = ctx.InputNames("X"); auto output_var_names = ctx.OutputNames("Out"); + auto dout_var_names = ctx.OutputNames("DOut"); // current program may not hold parameters std::vector param_names; @@ -195,7 +200,7 @@ class RunProgramOpKernel : public framework::OpKernel { // Step 2. 
prepare executor and init persistable variables framework::Executor exe(ctx.GetPlace()); auto exe_ctx = framework::GetExecutorInfoFromCache( - exe, ctx, {output_var_names}, /*is_grad=*/false); + exe, ctx, {output_var_names, dout_var_names}, /*is_grad=*/false); // NOTE(Aurelius84): While training some models, forward can be called many // times and then apply backpropagation all at once, such as Reinforcement @@ -219,6 +224,7 @@ class RunProgramOpKernel : public framework::OpKernel { // Step 4. Get Output details::ShareVarsFromScope(output_vars, output_var_names, &scope); + details::ShareVarsFromScope(dout_vars, dout_var_names, &scope); // Debug info: scope info when run end VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front()); diff --git a/paddle/fluid/operators/save_combine_op.cc b/paddle/fluid/operators/save_combine_op.cc index ec038f16113dda3915dde167ba49b6be245c9f02..6da73c99068bc0e0453dfdd1b5eca8e1add1954b 100644 --- a/paddle/fluid/operators/save_combine_op.cc +++ b/paddle/fluid/operators/save_combine_op.cc @@ -102,5 +102,7 @@ REGISTER_OP_CPU_KERNEL( save_combine, ops::SaveCombineOpKernel, ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, ops::SaveCombineOpKernel, ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc index 5594de16b6789e99d5c4cc6828889eb0e311624a..493f5081ee42b9232a680dace585473d3217eedc 100644 --- a/paddle/fluid/operators/save_load_combine_op_test.cc +++ b/paddle/fluid/operators/save_load_combine_op_test.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" USE_CPU_ONLY_OP(save_combine); @@ -76,33 +77,34 @@ void CheckValues(T* expect, U* actual, const paddle::framework::LoD& expect_lod, // Here, we create 4 LoDTensors and use save_combine_op to first save these // in a single file. 
Then, we use load_combine_op to load these sequentially -TEST(SaveLoadCombineOp, CPU) { +template +void SaveLoadCombineOp() { paddle::framework::Scope scope; paddle::platform::CPUPlace place; std::vector lod1 = {0, 1, 2, 3, 10}; int numel1 = 100; paddle::framework::LoD expect_lod1; - int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", - place, &scope, &expect_lod1); + T* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, + &scope, &expect_lod1); std::vector lod2 = {0, 2, 5, 10}; int numel2 = 200; paddle::framework::LoD expect_lod2; - int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", - place, &scope, &expect_lod2); + T* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, + &scope, &expect_lod2); std::vector lod3 = {0, 2, 3, 20}; int numel3 = 4000; paddle::framework::LoD expect_lod3; - int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", - place, &scope, &expect_lod3); + T* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place, + &scope, &expect_lod3); std::vector lod4 = {0, 1, 20}; int numel4 = 1000; paddle::framework::LoD expect_lod4; - int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", - place, &scope, &expect_lod4); + T* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, + &scope, &expect_lod4); // Set attributes std::string filename = "check_tensor.ls"; @@ -128,15 +130,21 @@ TEST(SaveLoadCombineOp, CPU) { load_combine_op->Run(scope, place); paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4; - int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, &actual_lod1); - int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, &actual_lod2); - int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, &actual_lod3); - int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, &actual_lod4); - - CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1); - CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2); - CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3); - CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4); + U* actual1 = GetValuesAfterLoadCombineOp(target1, scope, &actual_lod1); + U* actual2 = GetValuesAfterLoadCombineOp(target2, scope, &actual_lod2); + U* actual3 = GetValuesAfterLoadCombineOp(target3, scope, &actual_lod3); + U* actual4 = GetValuesAfterLoadCombineOp(target4, scope, &actual_lod4); + + CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1); + CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2); + CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3); + CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4); +} + +TEST(SaveLoadCombineOp, CPU) { SaveLoadCombineOp(); } + +TEST(SaveLoadCombineBF16Op, CPU) { + SaveLoadCombineOp(); } // FP16 version of SaveLoadCombineOp Test, only altering the saving aspect diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 194274cdd5bb4d59188e171866f685b127cb1369..d819c172e4a9d7b6911cd3f4bac66b342882b347 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -90,6 +90,8 @@ REGISTER_OP_CPU_KERNEL( ops::SaveOpKernel, ops::SaveOpKernel, + ops::SaveOpKernel, ops::SaveOpKernel, ops::SaveOpKernel, ops::SaveOpKernel, diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index a9b1f299dab82791e6a98afb2b75d65b1703a5a2..a195452791048d9875602285551a00cf6e42c7a8 100644 --- a/paddle/fluid/operators/scale_op.cc +++ 
b/paddle/fluid/operators/scale_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/scale_op.h" #include +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace framework { @@ -54,6 +55,21 @@ class ScaleOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,6 +103,9 @@ $$Out = scale*(X + bias)$$ "Apply bias addition after or before scaling. It is useful for " "numeric stability in some circumstances.") .SetDefault(true); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); } }; @@ -112,6 +131,8 @@ class ScaleGradMaker : public framework::SingleGradOpMaker { grad_op->SetAttr("scale", this->GetAttr("scale")); grad_op->SetAttr("bias", 0.0f); grad_op->SetAttr("bias_after_scale", true); + if (grad_op->HasAttr("use_mkldnn")) + grad_op->SetAttr("use_mkldnn", this->GetAttr("use_mkldnn")); } }; @@ -135,3 +156,18 @@ REGISTER_OP_CPU_KERNEL( ops::ScaleKernel, ops::ScaleKernel, ops::ScaleKernel); + +REGISTER_OP_CUDA_KERNEL( + scale, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel); diff --git a/paddle/fluid/operators/scale_op.cu b/paddle/fluid/operators/scale_op.cu deleted file mode 100644 index e1f20a73b20fc23ec8b99ba0e5154eb184718ca3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/scale_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/scale_op.h" -#include "paddle/fluid/platform/float16.h" -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - scale, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel); diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h index 11c81d23b2ed271ce89e6a27b1179e7d06dd0ebd..544f0a916681e6fe0042b0e7c3af537f5d464214 100644 --- a/paddle/fluid/operators/scale_op.h +++ b/paddle/fluid/operators/scale_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -68,11 +69,8 @@ class ScaleKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*out); auto eigen_in = framework::EigenVector::Flatten(*in); auto& dev = *ctx.template device_context().eigen_device(); - if (bias_after_scale) { - eigen_out.device(dev) = scale * eigen_in + bias; - } else { - eigen_out.device(dev) = scale * (eigen_in + bias); - } + EigenScale, T>::Eval( + dev, eigen_out, eigen_in, scale, bias, bias_after_scale); } }; diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc index cbfd11834ae47710bc8b80df15400689a50af6bc..6fb0e6d372745dc412a653e2fa27b398d1e16a5e 100644 --- a/paddle/fluid/operators/scale_op_npu.cc +++ b/paddle/fluid/operators/scale_op_npu.cc @@ -38,7 +38,7 @@ class ScaleNPUKernel : public framework::OpKernel { << " ,bias_after_scale:" << bias_after_scale; if (bias_after_scale) { out->mutable_data(ctx.GetPlace()); - auto runner = + const auto& runner = NpuOpRunner("Power", {*x}, {*out}, {{"power", _power}, {"scale", scale}, {"shift", bias}}); @@ -47,12 +47,13 @@ class ScaleNPUKernel : public framework::OpKernel { Tensor tmp_x(x->type()); tmp_x.Resize(x->dims()); tmp_x.mutable_data(ctx.GetPlace()); - auto runner_tmp = NpuOpRunner("Adds", {*x}, {tmp_x}, {{"value", bias}}); + const auto& runner_tmp = + NpuOpRunner("Adds", {*x}, {tmp_x}, {{"value", bias}}); runner_tmp.Run(stream); out->mutable_data(ctx.GetPlace()); float _bias = 0.0; - auto runner = + const auto& runner = NpuOpRunner("Power", {tmp_x}, {*out}, {{"power", _power}, {"scale", scale}, {"shift", _bias}}); runner.Run(stream); diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index b116a78891a93100942fa1d3cfb215e4fcc3b37d..61e95c2b50eb729f78a5e6340863ad63a0e60ba5 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -33,6 +33,14 @@ __global__ void ScatterInitCUDAKernel(const IndexT* indices, T* output, int indices_i = i / slice_size; int slice_i = i - indices_i * slice_size; // offset inside the slice IndexT scatter_i = indices[indices_i]; + + PADDLE_ENFORCE(scatter_i >= 0, + "The index is out of bounds, " + "please check whether the dimensions of index and " + "input meet the requirements. 
It should " + "be greater than or equal to 0, but received [%d]", + scatter_i); + IndexT out_i = scatter_i * slice_size + slice_i; *(output + out_i) = static_cast(0); } @@ -46,6 +54,14 @@ __global__ void ScatterCUDAKernel(const T* params, const IndexT* indices, int indices_i = i / slice_size; int slice_i = i - indices_i * slice_size; // offset inside the slice IndexT scatter_i = indices[indices_i]; + + PADDLE_ENFORCE(scatter_i >= 0, + "The index is out of bounds, " + "please check whether the dimensions of index and " + "input meet the requirements. It should " + "be greater than or equal to 0, but received [%d]", + scatter_i); + IndexT out_i = scatter_i * slice_size + slice_i; if (overwrite) { *(output + out_i) = *(params + i); @@ -67,6 +83,15 @@ __global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices, int64_t temp = slice_size; for (int64_t j = end_size - 1; j >= 0; --j) { IndexT index_value = indices[indices_i * end_size + j]; + + PADDLE_ENFORCE( + index_value >= 0 && index_value < output_dims[j], + "The index is out of bounds, " + "please check whether the dimensions of index and " + "input meet the requirements. It should " + "be less than [%d] and greater or equal to 0, but received [%d]", + output_dims[j], index_value); + gather_i += (index_value * temp); temp *= output_dims[j]; } diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h index 864a94a4235e65d67b960f444bb86a48c3af8159..2589033d2fef7202fc396ab8890e7a82b43d2ddd 100644 --- a/paddle/fluid/operators/scatter.h +++ b/paddle/fluid/operators/scatter.h @@ -118,6 +118,15 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, for (int i = 0; i < index_size; ++i) { IndexT index_ = p_index[i]; + + PADDLE_ENFORCE_GE(index_, 0, + platform::errors::OutOfRange( + "The index is out of bounds, " + "please check whether the dimensions of index and " + "input meet the requirements. It should " + "be greater than or equal to 0, but received [%d]", + index_)); + memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes); } } @@ -173,6 +182,15 @@ void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src, // if not in overwrite mode, need to init output data for (int i = 0; i < index_size; ++i) { const IndexT& index_ = p_index[i]; + + PADDLE_ENFORCE_GE(index_, 0, + platform::errors::OutOfRange( + "The index is out of bounds, " + "please check whether the dimensions of index and " + "input meet the requirements. It should " + "be greater than or equal to 0, but received [%d]", + index_)); + elementwise_inner_add(ctx, p_src, p_output, result_p_output, src, output, i, index_, slice_size, slice_bytes); @@ -233,6 +251,15 @@ void ScatterNdAdd(const framework::ExecutionContext& ctx, const Tensor& update, IndexT temp = 1; for (int64_t j = end_size - 1; j >= 0; --j) { IndexT index_value = p_index[i * end_size + j]; + PADDLE_ENFORCE_EQ( + (index_value >= 0 && index_value < output_dims[j]), true, + platform::errors::OutOfRange( + "The index is out of bounds, " + "please check whether the dimensions of index and " + "input meet the requirements. 
It should " + "be less than [%d] and greater or equal to 0, but received [%d]", + output_dims[j], index_value)); + index_ += (index_value * temp); temp *= output_dims[j]; } diff --git a/paddle/fluid/operators/scatter_op_npu.cc b/paddle/fluid/operators/scatter_op_npu.cc index e2e49acb94c7b22120acbd614c2f0ac139540f3c..d0183c6ed57c4dd59f51b8246772287024b8bf77 100644 --- a/paddle/fluid/operators/scatter_op_npu.cc +++ b/paddle/fluid/operators/scatter_op_npu.cc @@ -53,11 +53,11 @@ class ScatterNPUKernel : public framework::OpKernel { .stream(); if (overwrite) { - auto runner_update = NpuOpRunner("TensorScatterUpdate", - {*x, *index, *updates}, {*out}, {}); + const auto& runner_update = NpuOpRunner( + "TensorScatterUpdate", {*x, *index, *updates}, {*out}, {}); runner_update.Run(stream); } else { - auto runner_add = + const auto& runner_add = NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {}); runner_add.Run(stream); } diff --git a/paddle/fluid/operators/scatter_test.cc b/paddle/fluid/operators/scatter_test.cc index c83726180baeae6f4b73adda3bd9d9127b0f3e26..f94fce66806eee82f2c3434161426a19aa9d916e 100644 --- a/paddle/fluid/operators/scatter_test.cc +++ b/paddle/fluid/operators/scatter_test.cc @@ -54,4 +54,6 @@ TEST(scatter, ScatterUpdate) { EXPECT_EQ(output.data()[i], static_cast(i - 4)); for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], 0.0f); for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output.data()[i], 0.0f); + + delete cpu_place; } diff --git a/paddle/fluid/operators/seed_op_npu.cc b/paddle/fluid/operators/seed_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..e4466cdecae2124ced60682f4a47618d0921d3d2 --- /dev/null +++ b/paddle/fluid/operators/seed_op_npu.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/seed_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class NPUSeedKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Output("Out"); + int user_seed = ctx.Attr("seed"); + std::random_device rnd; + int seed; + + if (user_seed != 0) { + seed = user_seed; + } else { + seed = rnd(); + } + + out->mutable_data(ctx.GetPlace()); + FillNpuTensorWithConstant(out, seed); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + seed, ops::NPUSeedKernel); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index eca51147f8159e1bcb7c0c88ca7760e4f62e5543..c7b61333cdab3d2cadf8bf6af1b3e4b2df5ed6f0 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -23,6 +23,7 @@ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/assign_value_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/slice_utils.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/enforce.h" @@ -59,106 +60,6 @@ inline std::string GetValueName(framework::proto::VarType::Type data_type) { return value_name; } -inline void CheckAndUpdateSlice(const framework::DDim in_dims, - const std::vector axes, - std::vector* starts, - std::vector* ends, - std::vector* steps) { - for (size_t i = 0; i < axes.size(); ++i) { - int64_t axis = axes[i]; - int64_t dim_value = in_dims[axis]; - - int64_t start = - (*starts)[i] < 0 ? ((*starts)[i] + dim_value) : (*starts)[i]; - int64_t end = (*ends)[i] < 0 ? ((*ends)[i] + dim_value) : (*ends)[i]; - start = std::max(start, static_cast(0)); - end = std::min(end, dim_value); - - int64_t step = (*steps)[i]; - PADDLE_ENFORCE_NE( - step, 0, platform::errors::InvalidArgument( - "Step should not be 0, but received step = %d.", step)); - if (step > 0) { - start = std::min(start, dim_value); - end = std::max(end, static_cast(0)); - PADDLE_ENFORCE_GT( - end, start, - platform::errors::InvalidArgument( - "When step > 0, end should be greater than start, but " - "received end = %d, start = %d.", - end, start)); - } else { - // NOTE(liym27): When step < 0, start should less and equal to dim_value-1 - // "end is -1" means contain the 0-th element of this axis. 
- start = std::min(start, dim_value - 1); - end = std::max(end, static_cast(-1)); - PADDLE_ENFORCE_GT( - start, end, - platform::errors::InvalidArgument( - "When step < 0, start should be greater than end, but " - "received start = %d, end = %d.", - start, end)); - } - - (*starts)[i] = start; - (*ends)[i] = end; - } -} - -inline framework::DDim GetSliceDims(const framework::DDim in_dims, - const std::vector& axes, - const std::vector& starts, - const std::vector& ends, - const std::vector& steps) { - framework::DDim slice_dims(in_dims); - - for (size_t i = 0; i < axes.size(); ++i) { - int64_t axis = axes[i]; - int64_t start = starts[i]; - int64_t end = ends[i]; - int64_t step = steps[i]; - - if (step > 0) { - slice_dims[axis] = (end - start + step - 1) / step; - } else { - slice_dims[axis] = (end - start + step + 1) / step; - } - } - return slice_dims; -} - -inline framework::DDim GetDecreasedDims( - const framework::DDim slice_dims, - const std::vector& decrease_axes) { - // Get dims after decreasing axes. - framework::DDim decreased_dims(slice_dims); - if (decrease_axes.size() > 0) { - for (size_t i = 0; i < decrease_axes.size(); ++i) { - int64_t axis = decrease_axes[i]; - PADDLE_ENFORCE_EQ( - decreased_dims[axis], 1, - platform::errors::InvalidArgument("decrease dim should be 1")); - decreased_dims[axis] = 0; - } - - std::vector new_shape; - for (int i = 0; i < decreased_dims.size(); ++i) { - if (decreased_dims[i] != 0) { - new_shape.push_back(decreased_dims[i]); - } - } - - // NOTE(liym27): Paddle does not support that the rank of Tensor is 0, and - // uses [1] instead. - if (new_shape.size() == 0) { - new_shape.push_back(1); - } - - decreased_dims = framework::make_ddim(new_shape); - } - return decreased_dims; -} - template class SetValueKernel : public framework::OpKernel { public: @@ -225,8 +126,8 @@ class SetValueKernel : public framework::OpKernel { } auto in_dims = in->dims(); - CheckAndUpdateSlice(in_dims, axes, &starts, &ends, &steps); - auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, steps); + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); + auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps); auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/share_data_op.cc b/paddle/fluid/operators/share_data_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6fcc29e90026165f9ada90d372498c9fced02a39 --- /dev/null +++ b/paddle/fluid/operators/share_data_op.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/share_data_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class ShareDataOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ShareData"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "ShareData"); + auto in_type = ctx->GetInputsVarType("X")[0]; + auto out_type = ctx->GetOutputsVarType("Out")[0]; + + PADDLE_ENFORCE_EQ( + in_type == framework::proto::VarType::LOD_TENSOR || + in_type == framework::proto::VarType::SELECTED_ROWS, + true, platform::errors::InvalidArgument( + "Type of Variable[X] must be LoDTensor or SelectedRows!")); + PADDLE_ENFORCE_EQ( + in_type, out_type, + platform::errors::InvalidArgument( + "The type of input (X) and output (Out) are inconsistent.")); + + ctx->ShareDim("X", "Out"); + } +}; + +class ShareDataOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of share_data op"); + AddOutput("Out", "(Tensor), The output tensor of share_data op"); + AddComment(R"DOC( +ShareData Operator. + +Return a tensor $Out$ that shares data with the input tensor $X$ and without tensor copy. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + share_data, ops::ShareDataOp, ops::ShareDataOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(share_data, ops::ShareDataKernel, + ops::ShareDataKernel, ops::ShareDataKernel, + ops::ShareDataKernel, + ops::ShareDataKernel, + ops::ShareDataKernel, + ops::ShareDataKernel, + ops::ShareDataKernel) diff --git a/paddle/fluid/operators/sign_op.cu b/paddle/fluid/operators/share_data_op.cu similarity index 51% rename from paddle/fluid/operators/sign_op.cu rename to paddle/fluid/operators/share_data_op.cu index 817e0fbbd511462f161633242d28e63062676eb9..20cdaafa43de72502bffb5a36f6037a3524047a9 100644 --- a/paddle/fluid/operators/sign_op.cu +++ b/paddle/fluid/operators/share_data_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/sign_op.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/operators/share_data_op.h" REGISTER_OP_CUDA_KERNEL( - sign, - paddle::operators::SignKernel, - paddle::operators::SignKernel, - paddle::operators::SignKernel); + share_data, paddle::operators::ShareDataKernel, + paddle::operators::ShareDataKernel, + paddle::operators::ShareDataKernel, + paddle::operators::ShareDataKernel, + paddle::operators::ShareDataKernel, + paddle::operators::ShareDataKernel, + paddle::operators::ShareDataKernel, + paddle::operators::ShareDataKernel); diff --git a/paddle/fluid/operators/share_data_op.h b/paddle/fluid/operators/share_data_op.h new file mode 100644 index 0000000000000000000000000000000000000000..d876b4fabd5c09bf32322cf1a63e0c0fe7ed7d25 --- /dev/null +++ b/paddle/fluid/operators/share_data_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class ShareDataKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *in_var = ctx.InputVar("X"); + auto *out_var = ctx.OutputVar("Out"); + if (in_var->IsType()) { + const auto &origin_tensor = in_var->Get(); + auto *detach_tensor = out_var->GetMutable(); + detach_tensor->ShareDataWith(origin_tensor); + } else { + const auto &origin_selected_rows = in_var->Get(); + auto *detach_selected_rows = + out_var->GetMutable(); + detach_selected_rows->mutable_value()->ShareDataWith( + origin_selected_rows.value()); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/shuffle_batch_op.cc b/paddle/fluid/operators/shuffle_batch_op.cc index e540c728b69fe1e91bb9700871ff955d6d5b24a9..20459f92f3a590c114a07bcdc91fb5de49aaa3a4 100644 --- a/paddle/fluid/operators/shuffle_batch_op.cc +++ b/paddle/fluid/operators/shuffle_batch_op.cc @@ -53,6 +53,16 @@ class ShuffleBatchOp : public framework::OperatorWithKernel { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); return framework::OpKernelType(data_type, ctx.device_context()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { + if (var_name == "Seed") { + return expected_kernel_type; + } + return framework::OperatorWithKernel::GetKernelTypeForVar( + var_name, tensor, expected_kernel_type); + } }; class ShuffleBatchOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/shuffle_batch_op.cu b/paddle/fluid/operators/shuffle_batch_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..02210e64fb439828b0a706ac578a4ffb91489958 --- /dev/null +++ b/paddle/fluid/operators/shuffle_batch_op.cu @@ -0,0 +1,159 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifndef _MSC_VER +#include +#include +#include +#include +#endif + +#include "paddle/fluid/operators/shuffle_batch_op.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +template +struct ReorderFunctor { + ReorderFunctor(const T *x, const int64_t *shuffle_idx, T *y, int64_t stride) + : x_(x), shuffle_idx_(shuffle_idx), y_(y), stride_(stride) {} + + HOSTDEVICE void operator()(int64_t idx) { + auto reorder_idx = shuffle_idx_[idx / stride_] * stride_ + idx % stride_; + if (kIsForward) { + y_[idx] = x_[reorder_idx]; + } else { + y_[reorder_idx] = x_[idx]; + } + } + + private: + const T *x_; + const int64_t *shuffle_idx_; + T *y_; + int64_t stride_; +}; + +template +class ShuffleBatchCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { +#ifdef _MSC_VER + PADDLE_THROW(platform::errors::Unimplemented( + "GPU shuffle_batch is not supported on Windows yet")); +#else + auto *x = ctx.Input("X"); + auto *seed = ctx.Input("Seed"); + auto *out = ctx.Output("Out"); + auto *shuffleidx = ctx.Output("ShuffleIdx"); + auto *seed_out = ctx.Output("SeedOut"); + + int64_t x_embed_size = x->dims()[x->dims().size() - 1]; + int64_t elem_size = 1; + for (int i = 0; i < x->dims().size() - 1; i++) { + elem_size *= x->dims()[i]; + } + shuffleidx->Resize(framework::make_ddim({elem_size})); + + int64_t seed_int = 0; + if (seed->IsInitialized()) { + const auto &seed_place = seed->place(); + if (platform::is_gpu_place(seed_place)) { + // NOTE: We have overwritten GetKernelTypeForVar, so seed_place would + // not be CUDAPlace in practice. This case would only happen in Python + // op_test framework. 
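+      // If so, the scalar seed cannot be dereferenced from device memory
+      // directly, so it is first copied to the host with TensorCopySync and
+      // read from the temporary CPU tensor below.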
+ framework::Tensor tmp_tensor; + framework::TensorCopySync(*seed, platform::CPUPlace(), &tmp_tensor); + seed_int = *(tmp_tensor.data()); + } else { + seed_int = *(seed->data()); + } + } else { + seed_int = ctx.Attr("startup_seed"); + } + + auto *shuffleidx_data = shuffleidx->mutable_data(ctx.GetPlace()); + + auto &dev_ctx = ctx.template device_context(); +#ifdef PADDLE_WITH_CUDA + const auto &exec_policy = thrust::cuda::par.on(dev_ctx.stream()); +#else + const auto &exec_policy = thrust::hip::par.on(dev_ctx.stream()); +#endif + thrust::random::default_random_engine engine(seed_int); + thrust::counting_iterator cnt_iter(0); + thrust::shuffle_copy(exec_policy, cnt_iter, cnt_iter + elem_size, + thrust::device_pointer_cast(shuffleidx_data), engine); + // TODO(zengjinle): for small data, direct cudaMemcpy may be better + auto *x_data = x->data(); + auto *out_data = out->mutable_data(ctx.GetPlace()); + ReorderFunctor functor(x_data, shuffleidx_data, out_data, + x_embed_size); + platform::ForRange for_range( + dev_ctx, elem_size * x_embed_size); + for_range(functor); + + auto *seed_out_data = seed_out->mutable_data( + framework::make_ddim({1}), platform::CPUPlace()); + *seed_out_data = engine(); +#endif + } +}; + +template +class ShuffleBatchGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { +#ifdef _MSC_VER + PADDLE_THROW(platform::errors::Unimplemented( + "GPU shuffle_batch_grad is not supported on Windows yet")); +#else + const auto *out_grad = + ctx.Input(framework::GradVarName("Out")); + const auto *shuffleidx = ctx.Input("ShuffleIdx"); + auto *x_grad = ctx.Output(framework::GradVarName("X")); + + const auto *out_grad_data = out_grad->data(); + const auto *shuffleidx_data = shuffleidx->data(); + auto *x_grad_data = x_grad->mutable_data(ctx.GetPlace()); + auto x_embed_size = x_grad->dims()[x_grad->dims().size() - 1]; + ReorderFunctor functor(out_grad_data, shuffleidx_data, + x_grad_data, x_embed_size); + auto &dev_ctx = ctx.template device_context(); + // TODO(zengjinle): for small data, direct cudaMemcpy may be better + platform::ForRange for_range(dev_ctx, + x_grad->numel()); + for_range(functor); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(shuffle_batch, ops::ShuffleBatchCUDAKernel, + ops::ShuffleBatchCUDAKernel, + ops::ShuffleBatchCUDAKernel, + ops::ShuffleBatchCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(shuffle_batch_grad, + ops::ShuffleBatchGradCUDAKernel, + ops::ShuffleBatchGradCUDAKernel, + ops::ShuffleBatchGradCUDAKernel, + ops::ShuffleBatchGradCUDAKernel); +#endif diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc index 3485b4e5c2fbebd83e8f5ee34437db35ce5f1f20..6207c33f9d6299605d24f11c13820eac47ee6c98 100644 --- a/paddle/fluid/operators/sign_op.cc +++ b/paddle/fluid/operators/sign_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sign_op.h" #include +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { @@ -69,3 +70,10 @@ REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, REGISTER_OP_CPU_KERNEL( sign, ops::SignKernel, ops::SignKernel); + +REGISTER_OP_CUDA_KERNEL( + sign, + paddle::operators::SignKernel, + paddle::operators::SignKernel, + paddle::operators::SignKernel); diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h index b99934daee17e2b8a9295b488c0483e47187a009..b6d501afa621ac490be4ef3e567434779c61b0aa 100644 --- a/paddle/fluid/operators/sign_op.h +++ b/paddle/fluid/operators/sign_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" namespace paddle { namespace operators { @@ -31,7 +32,8 @@ class SignKernel : public framework::OpKernel { auto eigen_in = framework::EigenVector::Flatten(*in); auto& place = *context.template device_context().eigen_device(); - eigen_out.device(place) = eigen_in.sign(); + EigenSign, T>::Eval(place, eigen_out, + eigen_in); } }; diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 0a41424cfa11864879ff93d3807a3746a685b00d..01daba7c072845e47cf5aa176a4b7e060ee2d942 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -28,13 +28,10 @@ class SliceOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::InvalidArgument( - "Input (Input) of slice op should not be null.")); + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "slice"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "slice"); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output (Out) of slice op should not be null.")); + // Case 1: Special treatment when input is a tensor array. auto x_var_type = ctx->GetInputsVarType("Input")[0]; auto axes = ctx->Attrs().Get>("axes"); if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) { @@ -57,6 +54,8 @@ class SliceOp : public framework::OperatorWithKernel { return; } } + + // Case 2: input is a tensor. auto in_dims = ctx->GetInputDim("Input"); PADDLE_ENFORCE_LT(in_dims.size(), 7, platform::errors::InvalidArgument( @@ -65,101 +64,54 @@ class SliceOp : public framework::OperatorWithKernel { auto starts = ctx->Attrs().Get>("starts"); auto ends = ctx->Attrs().Get>("ends"); - auto infer_flags = ctx->Attrs().Get>("infer_flags"); auto decrease_axis = ctx->Attrs().Get>("decrease_axis"); - - auto starts_size = starts.size(); - auto ends_size = ends.size(); + auto infer_flags = ctx->Attrs().Get>("infer_flags"); if (infer_flags.empty()) { // Initialize infer_flags with 1. // To be compatible with other op tests in which infer_flags is not set. infer_flags = std::vector(axes.size(), 1); } + // 2.1 Check attrs. 
+ auto starts_size = starts.size(); + auto ends_size = ends.size(); + if (ctx->HasInputs("StartsTensorList")) { - auto StartsTensorList = ctx->Inputs("StartsTensorList"); - PADDLE_ENFORCE_GT(StartsTensorList.size(), 0, + starts_size = ctx->Inputs("StartsTensorList").size(); + PADDLE_ENFORCE_GT(starts_size, 0, platform::errors::InvalidArgument( "StartsTensorList size can't be zero")); - starts_size = StartsTensorList.size(); } if (ctx->HasInputs("EndsTensorList")) { - auto EndsTensorList = ctx->Inputs("EndsTensorList"); - PADDLE_ENFORCE_GT(EndsTensorList.size(), 0, - platform::errors::InvalidArgument( - "EndsTensorList size can't be zero")); - ends_size = EndsTensorList.size(); + ends_size = ctx->Inputs("EndsTensorList").size(); + PADDLE_ENFORCE_GT(ends_size, 0, platform::errors::InvalidArgument( + "EndsTensorList size can't be zero")); } - if (ctx->HasInput("StartsTensor") == false) { + if (!ctx->HasInput("StartsTensor")) { PADDLE_ENFORCE_EQ( starts_size, axes.size(), platform::errors::InvalidArgument( "The size of starts must be equal to the size of axes.")); } - if (ctx->HasInput("EndsTensor") == false) { + if (!ctx->HasInput("EndsTensor")) { PADDLE_ENFORCE_EQ( ends_size, axes.size(), platform::errors::InvalidArgument( "The size of ends must be equal to the size of axes.")); } - int dim_value, start, end; - for (size_t i = 0; i < axes.size(); ++i) { - PADDLE_ENFORCE_LT(static_cast(axes[i]), in_dims.size(), - platform::errors::InvalidArgument( - "The index of dimension in axes must be less " - "than the size of input shape.")); - if (infer_flags[i] == -1) { - out_dims[axes[i]] = -1; - } else { - // infer out_dim shape - dim_value = out_dims[axes[i]]; - if (dim_value > 0) { - start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; - end = ends[i] < 0 ? 
(ends[i] + dim_value) : ends[i]; - start = std::max(start, 0); - end = std::max(end, 0); - end = std::min(end, dim_value); - - PADDLE_ENFORCE_LE(start, dim_value, - platform::errors::InvalidArgument( - "start should be less than or equal to the " - "dimension value, but received " - "start = %d, shape[%d] = %d.", - starts[i], axes[i], out_dims[axes[i]])); - PADDLE_ENFORCE_GT(end, start, - platform::errors::InvalidArgument( - "end should greater than start, but received " - "end = %d, start = %d.", - ends[i], starts[i])); - out_dims[axes[i]] = end - start; - } - } - } - // generate new shape - if (decrease_axis.size() > 0) { - std::vector new_out_shape; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - if (ctx->IsRuntime() && infer_flags[i] != -1) { - PADDLE_ENFORCE_EQ( - out_dims[decrease_axis[i]], 1, - platform::errors::InvalidArgument("decrease dim should be 1")); - } - out_dims[decrease_axis[i]] = 0; - } + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, nullptr, + &infer_flags); - for (int i = 0; i < out_dims.size(); ++i) { - if (out_dims[i] != 0) { - new_out_shape.push_back(out_dims[i]); - } - } - if (new_out_shape.size() == 0) { - new_out_shape.push_back(1); - } - - out_dims = framework::make_ddim(new_out_shape); + auto slice_dims = + GetSliceDims(in_dims, axes, starts, ends, nullptr, &infer_flags); + if (ctx->IsRuntime()) { + out_dims = GetDecreasedDims(slice_dims, decrease_axis, &infer_flags); + } else { + out_dims = GetDecreasedDims(slice_dims, decrease_axis, nullptr); } + ctx->SetOutputDim("Out", out_dims); if (axes[0] != 0) { ctx->ShareLoD("Input", /*->*/ "Out"); @@ -185,6 +137,7 @@ class SliceOp : public framework::OperatorWithKernel { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace()); } + framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { @@ -436,9 +389,9 @@ REGISTER_OP_CPU_KERNEL( ops::SliceKernel, ops::SliceKernel, ops::SliceKernel, + paddle::platform::complex>, ops::SliceKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( slice_grad, ops::SliceGradKernel, @@ -446,6 +399,31 @@ REGISTER_OP_CPU_KERNEL( ops::SliceGradKernel, ops::SliceGradKernel, ops::SliceGradKernel, + paddle::platform::complex>, ops::SliceGradKernel); + paddle::platform::complex>); + +REGISTER_OP_CUDA_KERNEL( + slice, ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel>, + ops::SliceKernel>); + +REGISTER_OP_CUDA_KERNEL( + slice_grad, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel>, + ops::SliceGradKernel>); diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu deleted file mode 100644 index 5f80d3cc971f5413b8cb6f64cfa860af9013fa2b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/slice_op.cu +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/slice_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - slice, ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel, - ops::SliceKernel); - -REGISTER_OP_CUDA_KERNEL( - slice_grad, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel, - ops::SliceGradKernel); diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index 22f6fa9e3e6f206b33c46369086d1637fdc83457..96b8ea11d6845eb1b07cc05f1363ff34681d2071 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -17,22 +17,69 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/slice_utils.h" #include "paddle/fluid/operators/utils.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using Variable = framework::Variable; +using LoDTensorArray = framework::LoDTensorArray; +using DDim = framework::DDim; + +inline void DealTensorArray(const framework::ExecutionContext& ctx, + const std::vector& starts, + const std::vector& ends, + bool out_is_array) { + auto in_array = ctx.Input("Input"); + // If the input is LoDTensorArray, the rank of input is 1. + int64_t in_size = in_array->size(); + int64_t start = starts[0] < 0 ? (starts[0] + in_size) : starts[0]; + int64_t end = ends[0] < 0 ? (ends[0] + in_size) : ends[0]; + + start = std::max(start, static_cast(0)); + end = std::max(end, static_cast(0)); + end = std::min(end, in_size); + + PADDLE_ENFORCE_GT(end, start, + platform::errors::InvalidArgument( + "Attr(ends) should be greater than attr(starts) in " + "slice op. But received end = %d, start = %d.", + ends[0], starts[0])); + int64_t out_size = end - start; + + if (out_is_array) { + auto out_array = ctx.Output("Out"); + out_array->resize(out_size); + + for (int i = 0; i < out_size; ++i) { + auto* out_tensor = &out_array->at(i); + auto in_tensor = in_array->at(i + start); + out_tensor->set_lod(in_tensor.lod()); + if (in_tensor.memory_size() > 0) { + TensorCopy(in_tensor, ctx.GetPlace(), out_tensor); + } else { + VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << i << "]."; + } + } + } else { + auto out = ctx.Output("Out"); + auto in_tensor = in_array->at(start); + TensorCopy(in_tensor, ctx.GetPlace(), out); + } +} template class SliceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - int rank = is_tensor_array - ? 1 - : ctx.Input("Input")->dims().size(); + const Variable* input_var = ctx.InputVar("Input"); + bool is_tensor_array = input_var->IsType(); + int rank = is_tensor_array ? 
1 : ctx.Input("Input")->dims().size(); switch (rank) { case 1: @@ -53,53 +100,45 @@ class SliceKernel : public framework::OpKernel { case 6: SliceCompute<6>(ctx); break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "The rank of input should be less than 7, but received %d.", rank)); } } private: template - void SliceCompute(const framework::ExecutionContext& context) const { - auto& place = - *context.template device_context().eigen_device(); - const framework::Variable* input_var = context.InputVar("Input"); - framework::Variable* out_var = context.OutputVar("Out"); - bool input_is_tensor_array = input_var->IsType(); - bool out_is_tensor_array = out_var->IsType(); - - auto axes = context.Attr>("axes"); - - auto starts_int = context.Attr>("starts"); + void SliceCompute(const framework::ExecutionContext& ctx) const { + const Variable* input_var = ctx.InputVar("Input"); + Variable* out_var = ctx.OutputVar("Out"); + bool input_is_array = input_var->IsType(); + bool out_is_array = out_var->IsType(); + + auto axes_int = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); + std::vector axes(axes_int.begin(), axes_int.end()); std::vector starts(starts_int.begin(), starts_int.end()); - auto ends_int = context.Attr>("ends"); std::vector ends(ends_int.begin(), ends_int.end()); - auto decrease_axis = context.Attr>("decrease_axis"); - auto infer_flags = context.Attr>("infer_flags"); - auto list_new_ends_tensor = - context.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - context.MultiInput("StartsTensorList"); - - bool need_infer = false; - if (context.HasInput("StartsTensor") || context.HasInput("EndsTensor")) { - need_infer = true; - } - if (list_new_starts_tensor.size() > 0 || list_new_ends_tensor.size() > 0) { - need_infer = true; + + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto infer_flags = ctx.Attr>("infer_flags"); + + // Step 1: Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); } - if (need_infer) { - if (context.HasInput("StartsTensor")) { - auto* starts_tensor = context.Input("StartsTensor"); - starts = GetDataFromTensor(starts_tensor); - } else if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } - if (context.HasInput("EndsTensor")) { - auto* ends_tensor = context.Input("EndsTensor"); - ends = GetDataFromTensor(ends_tensor); - } else if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } + + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); } + PADDLE_ENFORCE_EQ( starts.size(), axes.size(), platform::errors::InvalidArgument( @@ -108,173 +147,74 @@ class SliceKernel : public framework::OpKernel { ends.size(), axes.size(), platform::errors::InvalidArgument( "The size of ends must be equal to the size of axes.")); - if (input_is_tensor_array) { - auto in_array = context.Input("Input"); - // If the input is LoDTensorArray, the rank of input is 1. - int64_t in_size = in_array->size(); - int64_t start = starts[0] < 0 ? 
(starts[0] + in_size) : starts[0]; - int64_t end = ends[0] < 0 ? (ends[0] + in_size) : ends[0]; - - start = std::max(start, static_cast(0)); - end = std::max(end, static_cast(0)); - end = std::min(end, in_size); - - PADDLE_ENFORCE_GT(end, start, - platform::errors::InvalidArgument( - "Attr(ends) should be greater than attr(starts) in " - "slice op. But received end = %d, start = %d.", - ends[0], starts[0])); - int64_t out_size = end - start; - - if (out_is_tensor_array) { - auto out_array = context.Output("Out"); - out_array->resize(out_size); - - for (int i = 0; i < out_size; ++i) { - auto* out_tensor = &out_array->at(i); - auto in_tensor = in_array->at(i + start); - out_tensor->set_lod(in_tensor.lod()); - if (in_tensor.memory_size() > 0) { - TensorCopy(in_tensor, context.GetPlace(), out_tensor); - } else { - VLOG(10) - << "WARNING: The input tensor 'x_tensor' holds no memory, so " - "nothing has been written to output array[" - << i << "]."; - } - } - } else { - auto out = context.Output("Out"); - auto in_tensor = in_array->at(start); - TensorCopy(in_tensor, context.GetPlace(), out); - } + // Step 2: Compute output + if (input_is_array) { + DealTensorArray(ctx, starts, ends, out_is_array); return; - } + } else { + auto in = ctx.Input("Input"); + auto out = ctx.Output("Out"); - auto in = context.Input("Input"); - auto out = context.Output("Out"); + auto in_dims = in->dims(); + auto out_dims = out->dims(); + auto slice_dims = out_dims; - auto out_dims = out->dims(); - auto in_dims = in->dims(); - if (need_infer) { - out_dims = in_dims; - int64_t dim_value, start, end; + // 2.1 Infer output dims for (size_t i = 0; i < axes.size(); ++i) { - dim_value = out_dims[axes[i]]; - if (dim_value > 0) { - // when end = start+1 and start == -1 - if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { - auto ret = - std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); - if (ret != decrease_axis.end()) { - ends[i] = 10000000; - } + // when start == -1 && end == start+1 + if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { + auto ret = + std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); + if (ret != decrease_axis.end()) { + ends[i] = in_dims[axes[i]]; } - - start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; - end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; - start = std::max(start, static_cast(0)); - end = std::max(end, static_cast(0)); - end = std::min(end, dim_value); - PADDLE_ENFORCE_GT( - end, start, - platform::errors::InvalidArgument( - "Attr(ends) should be greater than attr(starts) in " - "slice op. 
But received end = %d, start = %d.", - ends[i], starts[i])); - out_dims[axes[i]] = end - start; } } - out->Resize(out_dims); - // generate new shape - if (decrease_axis.size() > 0) { - std::vector new_out_shape; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - PADDLE_ENFORCE_EQ( - out_dims[decrease_axis[i]], 1, - platform::errors::InvalidArgument("decrease dim should be 1")); - out_dims[decrease_axis[i]] = 0; - } - for (int i = 0; i < out_dims.size(); ++i) { - if (out_dims[i] != 0) { - new_out_shape.push_back(out_dims[i]); - } - } - if (new_out_shape.size() == 0) { - new_out_shape.push_back(1); - } + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = + GetSliceDims(in_dims, axes, starts, ends, nullptr, nullptr); + out_dims = GetDecreasedDims(slice_dims, decrease_axis); - out_dims = framework::make_ddim(new_out_shape); - } - } - - // resize out_dims - if (decrease_axis.size() > 0) { - if (decrease_axis.size() == (size_t)in_dims.size()) { - std::vector vec_origin_out_shape(decrease_axis.size(), 1); - out->Resize(framework::make_ddim(vec_origin_out_shape)); - } else { - std::vector vec_origin_out_shape( - out_dims.size() + decrease_axis.size(), -1); + // 2.2 Get output + auto offsets = Eigen::DSizes(); + auto extents = Eigen::DSizes(); - for (size_t i = 0; i < decrease_axis.size(); ++i) { - vec_origin_out_shape[decrease_axis[i]] = 1; - } - - int index = 0; - for (size_t i = 0; i < vec_origin_out_shape.size(); ++i) { - if (vec_origin_out_shape[i] == -1) { - vec_origin_out_shape[i] = out_dims[index]; - ++index; - } - } - - out->Resize(framework::make_ddim(vec_origin_out_shape)); + for (size_t i = 0; i < D; ++i) { + offsets[i] = 0; + extents[i] = slice_dims[i]; } - } - - out->mutable_data(context.GetPlace()); - - auto new_out_dims = out->dims(); - auto offsets = Eigen::array(); - auto extents = Eigen::array(); - for (size_t i = 0; i < D; ++i) { - offsets[i] = 0; - extents[i] = new_out_dims[i]; - } - int64_t start; - for (size_t i = 0; i < axes.size(); ++i) { - start = starts[i]; - if (start < 0) { - start = (start + in_dims[axes[i]]); + for (size_t i = 0; i < axes.size(); ++i) { + offsets[axes[i]] = starts[i]; } - start = std::max(start, static_cast(0)); - offsets[axes[i]] = start; - } - auto in_t = - framework::EigenTensor::From( - *in); - auto out_t = - framework::EigenTensor::From( - *out, new_out_dims); - if (in->numel() <= Eigen::NumTraits::highest()) { - // similar to tf.slice: - // if element number less than INT_MAX, change the type of index to int - Eigen::DSizes offsets_32bit, extents_32bit; - for (size_t i = 0; i < D; i++) { - offsets_32bit[i] = offsets[i]; - extents_32bit[i] = extents[i]; + out->Resize(slice_dims); + out->mutable_data(ctx.GetPlace()); + + auto in_t = framework::EigenTensor::From(*in, in_dims); + auto out_t = framework::EigenTensor::From(*out, slice_dims); + auto& eigen_place = + *ctx.template device_context().eigen_device(); + + if (in->numel() <= Eigen::NumTraits::highest()) { + // similar to tf.slice: + // if element number less than INT_MAX, change the type of index to int + Eigen::DSizes offsets_32bit, extents_32bit; + for (size_t i = 0; i < D; i++) { + offsets_32bit[i] = offsets[i]; + extents_32bit[i] = extents[i]; + } + EigenSlice, T, D>::Eval( + eigen_place, framework::To32BitIndex(out_t), + framework::To32BitIndex(in_t), offsets_32bit, extents_32bit); + } else { + EigenSlice, T, D>::Eval( + eigen_place, out_t, in_t, offsets, extents); } - framework::To32BitIndex(out_t).device(place) = - 
framework::To32BitIndex(in_t).slice(offsets_32bit, extents_32bit); - } else { - out_t.device(place) = in_t.slice(offsets, extents); - } - out->Resize(out_dims); + out->Resize(out_dims); + } } }; @@ -282,11 +222,9 @@ template class SliceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const framework::Variable* input_var = ctx.InputVar("Input"); - bool is_tensor_array = input_var->IsType(); - size_t rank = is_tensor_array - ? 1 - : ctx.Input("Input")->dims().size(); + const Variable* input_var = ctx.InputVar("Input"); + bool is_array = input_var->IsType(); + size_t rank = is_array ? 1 : ctx.Input("Input")->dims().size(); switch (rank) { case 1: @@ -307,53 +245,48 @@ class SliceGradKernel : public framework::OpKernel { case 6: SliceCompute<6>(ctx); break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "The rank of input should be less than 7, but received %d.", rank)); } } private: template - void SliceCompute(const framework::ExecutionContext& context) const { - auto axes = context.Attr>("axes"); - - auto starts_int = context.Attr>("starts"); + void SliceCompute(const framework::ExecutionContext& ctx) const { + auto axes = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); std::vector starts(starts_int.begin(), starts_int.end()); - - auto ends_int = context.Attr>("ends"); std::vector ends(ends_int.begin(), ends_int.end()); - auto list_new_ends_tensor = - context.MultiInput("EndsTensorList"); - auto list_new_starts_tensor = - context.MultiInput("StartsTensorList"); - - if (list_new_starts_tensor.size() > 0) { - starts = GetDataFromTensorList(list_new_starts_tensor); - } else if (context.HasInput("StartsTensor")) { - auto* starts_tensor = context.Input("StartsTensor"); - starts = GetDataFromTensor(starts_tensor); + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); } - if (list_new_ends_tensor.size() > 0) { - ends = GetDataFromTensorList(list_new_ends_tensor); - } else if (context.HasInput("EndsTensor")) { - auto* ends_tensor = context.Input("EndsTensor"); - ends = GetDataFromTensor(ends_tensor); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); } - framework::Variable* d_input_var = - context.OutputVar(framework::GradVarName("Input")); - const framework::Variable* d_out_var = - context.InputVar(framework::GradVarName("Out")); - bool d_input_is_tensor_array = - d_input_var->IsType(); - bool d_out_is_tensor_array = d_out_var->IsType(); - - if (d_input_is_tensor_array) { - auto* input_array = context.Input("Input"); - auto* d_input_array = context.Output( - framework::GradVarName("Input")); + + Variable* d_input_var = ctx.OutputVar(framework::GradVarName("Input")); + const Variable* d_out_var = ctx.InputVar(framework::GradVarName("Out")); + bool d_input_is_array = d_input_var->IsType(); + bool d_out_is_array = d_out_var->IsType(); + + if (d_input_is_array) { + auto* input_array = ctx.Input("Input"); + auto* d_in_arr = + ctx.Output(framework::GradVarName("Input")); int64_t d_in_size = input_array->size(); - 
d_input_array->resize(d_in_size); + d_in_arr->resize(d_in_size); // If the input is LoDTensorArray, the rank of input is 1. // So only use the 0th element of starts. int64_t start = starts[0] < 0 ? (starts[0] + d_in_size) : starts[0]; @@ -361,68 +294,60 @@ class SliceGradKernel : public framework::OpKernel { // set zero platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& dev_ctx = *pool.Get(context.GetPlace()); - T value = T(0); + auto& dev_ctx = *pool.Get(ctx.GetPlace()); math::SetConstant functor; for (int i = 0; i < d_in_size; ++i) { auto dim = input_array->at(i).dims(); - d_input_array->at(i).Resize(dim); - d_input_array->at(i).mutable_data(context.GetPlace()); + d_in_arr->at(i).Resize(dim); + d_in_arr->at(i).mutable_data(ctx.GetPlace()); functor(reinterpret_cast(dev_ctx), - &d_input_array->at(i), static_cast(value)); + &d_in_arr->at(i), static_cast(0)); } - if (d_out_is_tensor_array) { - auto* d_out_array = context.Input( - framework::GradVarName("Out")); - int d_out_size = d_out_array->size(); + if (d_out_is_array) { + auto* d_out_arr = + ctx.Input(framework::GradVarName("Out")); + int d_out_size = d_out_arr->size(); for (int i = 0; i < d_out_size; ++i) { - TensorCopy(d_out_array->at(i), context.GetPlace(), - &(d_input_array->at(start + i))); + TensorCopy(d_out_arr->at(i), ctx.GetPlace(), + &(d_in_arr->at(start + i))); } - } else { - auto* d_out = - context.Input(framework::GradVarName("Out")); - TensorCopy(*d_out, context.GetPlace(), &(d_input_array->at(start))); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + TensorCopy(*d_out, ctx.GetPlace(), &(d_in_arr->at(start))); } return; } - auto* d_out = - context.Input(framework::GradVarName("Out")); - - auto* d_input = - context.Output(framework::GradVarName("Input")); - - d_input->mutable_data(context.GetPlace()); + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_input = ctx.Output(framework::GradVarName("Input")); + d_input->mutable_data(ctx.GetPlace()); auto out_dims = d_out->dims(); auto in_dims = d_input->dims(); - auto decrease_axis = context.Attr>("decrease_axis"); - if (decrease_axis.size() > 0) { - if (decrease_axis.size() == (size_t)in_dims.size()) { + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto decrease_size = decrease_axis.size(); + if (decrease_size > 0) { + if (decrease_size == (size_t)in_dims.size()) { // all dims decrease - std::vector vec_origin_out_shape(decrease_axis.size(), 1); - out_dims = framework::make_ddim(vec_origin_out_shape); + std::vector origin_out_shape(decrease_size, 1); + out_dims = framework::make_ddim(std::vector(decrease_size, 1)); } else { - std::vector vec_origin_out_shape( - out_dims.size() + decrease_axis.size(), -1); - - for (size_t i = 0; i < decrease_axis.size(); ++i) { - vec_origin_out_shape[decrease_axis[i]] = 1; + std::vector origin_out_shape(out_dims.size() + decrease_size, -1); + for (size_t i = 0; i < decrease_size; ++i) { + origin_out_shape[decrease_axis[i]] = 1; } int index = 0; - for (size_t i = 0; i < vec_origin_out_shape.size(); ++i) { - if (vec_origin_out_shape[i] == -1) { - vec_origin_out_shape[i] = out_dims[index]; + for (size_t i = 0; i < origin_out_shape.size(); ++i) { + if (origin_out_shape[i] == -1) { + origin_out_shape[i] = out_dims[index]; ++index; } } - out_dims = framework::make_ddim(vec_origin_out_shape); + out_dims = framework::make_ddim(origin_out_shape); } } @@ -432,28 +357,26 @@ class SliceGradKernel : public framework::OpKernel { offsets[i] = 0; extents[i] = out_dims[i]; } - int64_t start; + for 
(size_t i = 0; i < axes.size(); ++i) { - start = starts[i]; - if (start < 0) { - start = (start + in_dims[axes[i]]); - } + int axis = axes[i]; + int64_t start = starts[i] < 0 ? (starts[i] + in_dims[axis]) : starts[i]; start = std::max(start, static_cast(0)); - offsets[axes[i]] = start; + offsets[axis] = start; } + Eigen::array, D> paddings; for (size_t i = 0; i < paddings.size(); ++i) { paddings[i].first = offsets[i]; paddings[i].second = (in_dims[i] - out_dims[i]) - offsets[i]; } - EigenPaddingCompute(context, d_input, in_dims, d_out, out_dims, paddings); + EigenPaddingCompute(ctx, d_input, in_dims, d_out, out_dims, paddings); } template void EigenPaddingCompute( - const framework::ExecutionContext& context, framework::Tensor* d_input, - const framework::DDim& in_dims, const framework::Tensor* d_out, - const framework::DDim& out_dims, + const framework::ExecutionContext& context, Tensor* d_input, + const DDim& in_dims, const Tensor* d_out, const DDim& out_dims, const Eigen::array, D>& paddings) const { if (D <= 3) { // if dimension less than 3, cannot reduce dimension @@ -509,10 +432,8 @@ class SliceGradKernel : public framework::OpKernel { out_tore_shape[1] = out_dims[pad_dim]; // convert array from std::vector to DDim - framework::DDim reshaped_in_dims = - framework::make_ddim(in_tore_shape); - framework::DDim reshaped_out_dims = - framework::make_ddim(out_tore_shape); + DDim reshaped_in_dims = framework::make_ddim(in_tore_shape); + DDim reshaped_out_dims = framework::make_ddim(out_tore_shape); // after reshape: the first dimension do not need padding, // set padding[0] zero @@ -540,10 +461,8 @@ class SliceGradKernel : public framework::OpKernel { } // convert array from std::vector to DDim - framework::DDim reshaped_in_dims = - framework::make_ddim(in_tore_shape); - framework::DDim reshaped_out_dims = - framework::make_ddim(out_tore_shape); + DDim reshaped_in_dims = framework::make_ddim(in_tore_shape); + DDim reshaped_out_dims = framework::make_ddim(out_tore_shape); // after reshape: // the first dimension is the previous padding dimension @@ -576,10 +495,8 @@ class SliceGradKernel : public framework::OpKernel { } // convert array from std::vector to DDim - framework::DDim reshaped_in_dims = - framework::make_ddim(in_tore_shape); - framework::DDim reshaped_out_dims = - framework::make_ddim(out_tore_shape); + DDim reshaped_in_dims = framework::make_ddim(in_tore_shape); + DDim reshaped_out_dims = framework::make_ddim(out_tore_shape); // after reshape: // the first dimension do not need padding, set padding[0] zero @@ -603,9 +520,8 @@ class SliceGradKernel : public framework::OpKernel { template void LaunchEigenPadding( - const framework::ExecutionContext& context, framework::Tensor* d_input, - const framework::DDim& in_dims, const framework::Tensor* d_out, - const framework::DDim& out_dims, + const framework::ExecutionContext& context, Tensor* d_input, + const DDim& in_dims, const Tensor* d_out, const DDim& out_dims, const Eigen::array, D>& paddings) const { auto& place = *context.template device_context().eigen_device(); @@ -624,10 +540,12 @@ class SliceGradKernel : public framework::OpKernel { paddings_32bit[i] = std::make_pair(paddings[i].first, paddings[i].second); } - framework::To32BitIndex(d_in_t).device(place) = - framework::To32BitIndex(d_out_t).pad(paddings_32bit, T(0)); + EigenPad, T, D>::Eval( + place, framework::To32BitIndex(d_in_t), + framework::To32BitIndex(d_out_t), paddings_32bit, static_cast(0)); } else { - d_in_t.device(place) = d_out_t.pad(paddings, T(0)); + EigenPad, 
T, D>::Eval( + place, d_in_t, d_out_t, paddings, static_cast(0)); } } }; diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 9974536da9acb401a859c2c9f1d10d79eed680bb..1084eadc55c5bcaeb86a1aac5016b996beb5873b 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -25,15 +25,16 @@ namespace operators { using Tensor = framework::Tensor; -void UpdateAttr(const framework::DDim in_dims, const std::vector axes, +void UpdateAttr(const framework::DDim& in_dims, const std::vector axes, const std::vector starts, const std::vector ends, std::vector* offsets, std::vector* size) { int cnt = 0; for (int i = 0; i < in_dims.size(); ++i) { int start = 0; int end = in_dims[i]; - int axis = axes[cnt]; - + // NOTE(zhiqiu): Becareful that cnt may > axes.size() and result in + // overflow. + int axis = cnt < static_cast(axes.size()) ? axes[cnt] : -1; if (axis == i) { start = starts[cnt]; if (start < 0) { @@ -60,20 +61,75 @@ class SliceNPUKernel : public framework::OpKernel { auto* input = ctx.Input("Input"); auto* out = ctx.Output("Out"); - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); + auto axes_int = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); + std::vector axes(axes_int.begin(), axes_int.end()); + std::vector starts(starts_int.begin(), starts_int.end()); + std::vector ends(ends_int.begin(), ends_int.end()); + + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto infer_flags = ctx.Attr>("infer_flags"); + + const auto& in_dims = input->dims(); + + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); + } + + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); + } + + PADDLE_ENFORCE_EQ( + starts.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of starts must be equal to the size of axes.")); + PADDLE_ENFORCE_EQ( + ends.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of ends must be equal to the size of axes.")); + + if (ctx.HasInput("StartsTensor") || ctx.HasInput("EndsTensor") || + starts_tensor_list.size() > 0 || ends_tensor_list.size() > 0) { + // Infer output dims + auto out_dims = out->dims(); + auto slice_dims = out_dims; + for (size_t i = 0; i < axes.size(); ++i) { + // when start == -1 && end == start+1 + if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { + auto ret = + std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); + if (ret != decrease_axis.end()) { + ends[i] = in_dims[axes[i]]; + } + } + } + + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = + GetSliceDims(in_dims, axes, starts, ends, nullptr, nullptr); + out_dims = GetDecreasedDims(slice_dims, decrease_axis); + + out->Resize(out_dims); + } out->mutable_data(ctx.GetPlace()); - auto in_dims = input->dims(); std::vector offsets(in_dims.size()); std::vector size(in_dims.size()); UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); - auto runner = NpuOpRunner("SliceD", {*input}, {*out}, - {{"offsets", offsets}, 
{"size", size}}); + const auto& runner = NpuOpRunner("SliceD", {*input}, {*out}, + {{"offsets", offsets}, {"size", size}}); auto stream = ctx.template device_context() @@ -90,11 +146,29 @@ class SliceGradNPUKernel : public framework::OpKernel { auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dinput = ctx.Output(framework::GradVarName("Input")); - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); + auto axes_int = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); + std::vector axes(axes_int.begin(), axes_int.end()); + std::vector starts(starts_int.begin(), starts_int.end()); + std::vector ends(ends_int.begin(), ends_int.end()); + + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); + } + + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); + } - auto in_dims = input->dims(); + const auto& in_dims = input->dims(); int rank = in_dims.size(); std::vector offsets(rank); @@ -111,7 +185,7 @@ class SliceGradNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context() .stream(); - auto runner = + const auto& runner = NpuOpRunner("PadD", {*dout}, {*dinput}, {{"paddings", paddings}}); runner.Run(stream); } diff --git a/paddle/fluid/operators/slice_utils.h b/paddle/fluid/operators/slice_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..60782a9a9248f8b07b2953f7cf54a1329b137687 --- /dev/null +++ b/paddle/fluid/operators/slice_utils.h @@ -0,0 +1,143 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +inline void CheckAndUpdateSliceAttrs(const framework::DDim in_dims, + const std::vector& axes, + std::vector* starts, + std::vector* ends, + std::vector* steps = nullptr, + std::vector* infer_flags = nullptr) { + for (size_t i = 0; i < axes.size(); ++i) { + T axis = axes[i]; + T dim_value = in_dims[axis]; + + if (dim_value > 0) { + if (infer_flags != nullptr && (*infer_flags)[i] == -1) { + continue; + } + T start = (*starts)[i] < 0 ? ((*starts)[i] + dim_value) : (*starts)[i]; + start = std::max(start, static_cast(0)); + + T end = (*ends)[i] < 0 ? ((*ends)[i] + dim_value) : (*ends)[i]; + end = std::min(end, dim_value); + + T step = steps == nullptr ? 
1 : (*steps)[i]; + PADDLE_ENFORCE_NE( + step, 0, platform::errors::InvalidArgument( + "Step should not be 0, but received step = %d.", step)); + + if (step > 0) { + start = std::min(start, dim_value); + end = std::max(end, static_cast(0)); + PADDLE_ENFORCE_GT( + end, start, + platform::errors::InvalidArgument( + "When step > 0, end should be greater than start, but " + "received end = %d, start = %d.", + end, start)); + } else { + // NOTE(liym27): When step < 0, start should less and equal to + // dim_value-1 + // "end is -1" means contain the 0-th element of this axis. + start = std::min(start, dim_value - 1); + end = std::max(end, static_cast(-1)); + PADDLE_ENFORCE_GT( + start, end, + platform::errors::InvalidArgument( + "When step < 0, start should be greater than end, but " + "received start = %d, end = %d.", + start, end)); + } + + (*starts)[i] = start; + (*ends)[i] = end; + } + } +} + +template +inline framework::DDim GetSliceDims(const framework::DDim in_dims, + const std::vector& axes, + const std::vector& starts, + const std::vector& ends, + std::vector* steps = nullptr, + std::vector* infer_flags = nullptr) { + framework::DDim slice_dims(in_dims); + + for (size_t i = 0; i < axes.size(); ++i) { + T axis = axes[i]; + if (infer_flags != nullptr && (*infer_flags)[i] == -1) { + slice_dims[axis] = -1; + continue; + } + + T start = starts[i]; + T end = ends[i]; + T step = steps == nullptr ? 1 : (*steps)[i]; + + if (step > 0) { + slice_dims[axis] = (end - start + step - 1) / step; + } else { + slice_dims[axis] = (end - start + step + 1) / step; + } + } + return slice_dims; +} + +template +inline framework::DDim GetDecreasedDims(const framework::DDim slice_dims, + const std::vector& decrease_axes, + std::vector* infer_flags = nullptr) { + framework::DDim decreased_dims(slice_dims); + if (decrease_axes.size() > 0) { + for (size_t i = 0; i < decrease_axes.size(); ++i) { + T axis = decrease_axes[i]; + if (infer_flags && (*infer_flags)[i] != -1) { + PADDLE_ENFORCE_EQ( + decreased_dims[axis], 1, + platform::errors::InvalidArgument("decrease dim should be 1")); + } + decreased_dims[axis] = 0; + } + + std::vector new_shape; + for (int i = 0; i < decreased_dims.size(); ++i) { + if (decreased_dims[i] != 0) { + new_shape.push_back(decreased_dims[i]); + } + } + + // NOTE(liym27): Paddle does not support that the rank of Tensor is 0, and + // uses [1] instead. + if (new_shape.size() == 0) { + new_shape.push_back(1); + } + + decreased_dims = framework::make_ddim(new_shape); + } + return decreased_dims; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 08266318fb970ba976269991351152c22b38dbf2..68a1649d0a039d8b63b4811f1e7606b0c071fb9d 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -65,6 +65,9 @@ class SoftmaxKernel : public framework::OpKernel { // allocate memory on device. Out->mutable_data(context.GetPlace()); + if (Out->numel() == 0) { + return; + } const int n = SizeToAxis(axis, X->dims()); const int d = SizeFromAxis(axis, X->dims()); @@ -97,6 +100,9 @@ class SoftmaxGradKernel : public framework::OpKernel { // allocate memory on device. 
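+    // Same early exit as in the forward kernel: the numel() == 0 check added
+    // below returns immediately for empty gradients, so no Eigen expression
+    // is evaluated on a zero-sized tensor.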
dX->mutable_data(context.GetPlace()); + if (dX->numel() == 0) { + return; + } const int n = SizeToAxis(axis, dX->dims()); const int d = SizeFromAxis(axis, dX->dims()); diff --git a/paddle/fluid/operators/softmax_op_npu.cc b/paddle/fluid/operators/softmax_op_npu.cc index 0e94f6af232f98e093953e1aee37306eb460211d..212b600fda1ae88588d6401e9407268a995ad752 100644 --- a/paddle/fluid/operators/softmax_op_npu.cc +++ b/paddle/fluid/operators/softmax_op_npu.cc @@ -31,7 +31,7 @@ class SoftmaxNPUKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - auto runner = NpuOpRunner("SoftmaxV2", {*in}, {*out}, attr_input); + const auto& runner = NpuOpRunner("SoftmaxV2", {*in}, {*out}, attr_input); auto stream = ctx.template device_context() @@ -71,8 +71,8 @@ class SoftmaxGradNPUKernel : public framework::OpKernel { dX->mutable_data(ctx.GetPlace()); framework::NPUAttributeMap attr_input = {}; - auto runner = NpuOpRunner(std::string("SoftmaxGrad"), {tmp_out, tmp_dOut}, - {*dX}, attr_input); + const auto& runner = NpuOpRunner(std::string("SoftmaxGrad"), + {tmp_out, tmp_dOut}, {*dX}, attr_input); auto stream = ctx.template device_context() diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc index ed7034ef6ab416a4e98ddcd02f045af459298d65..3527478f7661058e193d14d95f815beb28f1e92a 100644 --- a/paddle/fluid/operators/softmax_op_xpu.cc +++ b/paddle/fluid/operators/softmax_op_xpu.cc @@ -47,8 +47,8 @@ class SoftmaxXPUKernel : public framework::OpKernel { int len = x->numel(); T* clip_x_data = clip_x.mutable_data(context.GetPlace(), len * sizeof(T)); - r = xpu::clip(dev_ctx.x_context(), x->data(), clip_x_data, len, - -1e30, 1e30); + r = xpu::clip_v2(dev_ctx.x_context(), x->data(), clip_x_data, len, + static_cast(-1e20), static_cast(1e20)); PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External("XPU API(clip) return wrong " "value[%d %s]", diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index e58b39252ce5f443ca473ef7a720881e375bb0b7..0c2d39e7519ef473f01de5671f0035d7acde6dd4 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -44,6 +44,19 @@ class SoftmaxWithCrossEntropyOpMaker "The outputs value of softmax activation by given the input batch, " "which will be used in backward calculation.") .AsIntermediate(); +#ifdef PADDLE_WITH_ASCEND_CL + AddOutput( + "Backprop", + "(Tensor, default: Tensor), A tensor in same shape with " + "Input(Logits). " + "The intermediate value used for backward calculation. The calculation " + "is :" + "exp(logits -max_logits) / sum(exp(logits - max_logits)) - labels, " + "where labels is ont-hot." + "Currently, the tensor is generated and used in npu kernel only. 
") + .AsIntermediate() + .AsDispensable(); +#endif AddOutput("Loss", "(Tensor, default: Tensor), A tensor in same shape with " "Input(Logits) " @@ -55,7 +68,7 @@ class SoftmaxWithCrossEntropyOpMaker "the given labels as soft labels.") .SetDefault(false); AddAttr( - "softmax_switch", + "use_softmax", "(bool, default: true), A flag to indicate whether to do softmax ") .SetDefault(true); AddAttr( @@ -181,7 +194,10 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { } ctx->SetOutputDim("Softmax", logits_dims); - +#ifdef PADDLE_WITH_ASCEND_CL + ctx->SetOutputDim("Backprop", logits_dims); + ctx->ShareLoD("Logits", /*->*/ "Backprop"); +#endif logits_dims[axis] = 1; ctx->SetOutputDim("Loss", logits_dims); @@ -285,6 +301,9 @@ class SoftmaxGradMaker : public framework::SingleGradOpMaker { grad_op->SetType("softmax_with_cross_entropy_grad"); grad_op->SetInput("Label", this->Input("Label")); grad_op->SetInput("Softmax", this->Output("Softmax")); +#ifdef PADDLE_WITH_ASCEND_CL + grad_op->SetInput("Backprop", this->Output("Backprop")); +#endif grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss")); grad_op->SetOutput(framework::GradVarName("Logits"), this->InputGrad("Logits")); @@ -317,10 +336,29 @@ REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy, REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad, ops::SoftmaxWithCrossEntropyGradKernel, ops::SoftmaxWithCrossEntropyGradKernel); + REGISTER_OP_VERSION(softmax_with_cross_entropy) +#ifdef PADDLE_WITH_ASCEND_CL + .AddCheckpoint( + R"ROC( + Add a new attribute [use_softmax] )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "use_softmax", "A flag to indicate whether to do softmax", true)) + .AddCheckpoint( + R"ROC( + Add a new dispensable/intermediate output [backprop] )ROC", + paddle::framework::compatible::OpVersionDesc().NewOutput( + "Backprop", + "The intermediate value used for backward calculation. The " + "calculation is :" + "exp(logits -max_logits) / sum(exp(logits - max_logits)) - labels, " + "where labels is ont-hot." + "Currently, the tensor is generated and used in npu kernel " + "only. 
")); +#else .AddCheckpoint( R"ROC( - Add a new attribute [softmax_switch] )ROC", + Add a new attribute [use_softmax] )ROC", paddle::framework::compatible::OpVersionDesc().NewAttr( - "softmax_switch", "A flag to indicate whether to do softmax", - true)); + "use_softmax", "A flag to indicate whether to do softmax", true)); +#endif diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 140059256c3cc954a56dbae24804d446e7d46ce9..4aec4c174227921d6b396033d26550145dbd6bb2 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -772,10 +772,10 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { platform::is_gpu_place(context.GetPlace()), true, platform::errors::Unavailable("softmax_with_cross_entropy operator's " "CUDA kernel only runs on GPU device.")); - const bool softmax_switch = context.Attr("softmax_switch"); + const bool use_softmax = context.Attr("use_softmax"); // do not with softmax op, and input is softmax - if (!softmax_switch) { + if (!use_softmax) { const Tensor* softmax = context.Input("Logits"); const Tensor* labels = context.Input("Label"); Tensor* softmax_out = context.Output("Softmax"); @@ -925,10 +925,10 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { int block = 512; auto stream = context.cuda_device_context().stream(); auto ignore_index = context.Attr("ignore_index"); - auto softmax_switch = context.Attr("softmax_switch"); + auto use_softmax = context.Attr("use_softmax"); // do not with softmax op, and input is softmax - if (!softmax_switch) { + if (!use_softmax) { if (context.Attr("soft_label")) { int grid = (n * d + block - 1) / block; const T* label_data = labels->data(); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index 55b811cbe31e40bf26ef826b5445bfcaba57bbdc..74316841a13b1771cbe815b6b0180a4747e9df70 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -31,10 +31,10 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::is_cpu_place(context.GetPlace()), true, platform::errors::Unimplemented("This kernel only runs on CPU.")); - const bool softmax_switch = context.Attr("softmax_switch"); + const bool use_softmax = context.Attr("use_softmax"); // do not with softmax op, and input is softmax - if (!softmax_switch) { + if (!use_softmax) { const Tensor* softmax = context.Input("Logits"); const Tensor* labels = context.Input("Label"); Tensor* softmax_out = context.Output("Softmax"); @@ -113,9 +113,9 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { context.Output(framework::GradVarName("Logits")); const Tensor* softmax = context.Input("Softmax"); - const bool softmax_switch = context.Attr("softmax_switch"); + const bool use_softmax = context.Attr("use_softmax"); - if (logit_grad != softmax || !softmax_switch) { + if (logit_grad != softmax || !use_softmax) { framework::TensorCopy(*softmax, context.GetPlace(), context.device_context(), logit_grad); } @@ -138,8 +138,8 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { auto logit_grad_mat = framework::EigenMatrix::From(logit_grad_2d); auto& place = *context.template device_context() .eigen_device(); - if (!softmax_switch) { - // softmax_switch step1 + if (!use_softmax) { + // 
use_softmax step1 if (soft_label) { auto lbl_mat = framework::EigenMatrix::From(labels_2d); logit_grad_mat.device(place) = @@ -148,7 +148,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { out_grad_mat.broadcast(Eigen::DSizes(1, axis_dim)) * logit_grad_mat; } - // softmax_switch step2 + // use_softmax step2 else { const int64_t* label_data = labels->data(); T* logit_grad_data = logit_grad->data(); @@ -181,7 +181,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { return; } - // for softmax_switch=False, continue + // for use_softmax=False, continue if (soft_label) { // when soft_label = True, ignore_index is not supported diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc index a34946315f5a81d04956735ce5b89b72761a6d0f..639fc6fcc2e79b265e6fda48303db6603ef12401 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -32,80 +32,53 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { auto* labels = ctx.Input("Label"); auto* softmax = ctx.Output("Softmax"); auto* loss = ctx.Output("Loss"); + auto* backprop = ctx.Output("Backprop"); + auto soft_label = ctx.Attr("soft_label"); + PADDLE_ENFORCE_EQ(soft_label, false, + platform::errors::Unimplemented( + "soft_label=True is not supported in " + "the npu kernel of softmax_with_cross_entropy.")); - int cls_num = logits->dims()[1]; const int rank = logits->dims().size(); const int axis = CanonicalAxis(ctx.Attr("axis"), rank); - std::vector axes; - for (auto i = axis; i < logits->dims().size(); ++i) { - axes.push_back(i); - } + const int n = SizeToAxis(axis, logits->dims()); + const int d = SizeFromAxis(axis, logits->dims()); + + PADDLE_ENFORCE_EQ( + labels->numel(), n, + platform::errors::Unimplemented( + "The size of labels should be equal to SizeToAxis of logits," + "but got size of labels is %d and SizeToAxis is %d.", + labels->numel(), n)); + + loss->mutable_data(ctx.GetPlace()); + backprop->mutable_data(ctx.GetPlace()); + softmax->mutable_data(ctx.GetPlace()); + + Tensor logits_2d, labels_1d, loss_1d, backprop_2d, softmax_2d; + logits_2d.ShareDataWith(*logits).Resize({n, d}); + labels_1d.ShareDataWith(*labels).Resize({n}); + loss_1d.ShareDataWith(*loss).Resize({n}); + backprop_2d.ShareDataWith(*backprop).Resize({n, d}); + softmax_2d.ShareDataWith(*softmax).Resize({n, d}); auto stream = ctx.template device_context() .stream(); - // softmax - softmax->mutable_data(ctx.GetPlace()); - auto runner_softmax = + std::vector axes; + for (auto i = axis; i < logits->dims().size(); ++i) { + axes.push_back(i); + } + const auto& runner_softmax = NpuOpRunner("SoftmaxV2", {*logits}, {*softmax}, {{"axes", axes}}); runner_softmax.Run(stream); - // cast label from int64/int32 to int32 - Tensor tmp_labels(framework::proto::VarType::INT32); - if (labels->type() != framework::proto::VarType::INT32) { - tmp_labels.Resize(labels->dims()); - tmp_labels.mutable_data(ctx.GetPlace(), framework::proto::VarType::INT32); - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); - auto runner_cast_label = - NpuOpRunner("Cast", {*labels}, {tmp_labels}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_label.Run(stream); - labels = &tmp_labels; - } - - // on and off - Tensor on_tensor(framework::proto::VarType::INT32); - on_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&on_tensor, static_cast(1)); - 
Tensor off_tensor(framework::proto::VarType::INT32); - off_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&off_tensor, static_cast(0)); - - // one_hot - Tensor tmp_onehot(on_tensor.type()); - tmp_onehot.Resize(logits->dims()); - tmp_onehot.mutable_data(ctx.GetPlace()); - - auto runner_onehot = - NpuOpRunner("OneHotD", {*labels, on_tensor, off_tensor}, {tmp_onehot}, - {{"axis", -1}, {"depth", cls_num}}); - runner_onehot.Run(stream); - - // cast one_hot from int32 to T - Tensor cast_onehot(logits->type()); - cast_onehot.Resize(tmp_onehot.dims()); - cast_onehot.mutable_data(ctx.GetPlace()); - auto dst_dtype = ConvertToNpuDtype(logits->type()); - auto runner_cast_onehot = - NpuOpRunner("Cast", {tmp_onehot}, {cast_onehot}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_onehot.Run(stream); - - // SoftmaxCrossEntropyWithLogits - Tensor backprop(logits->type()); - backprop.Resize(logits->dims()); - backprop.mutable_data(ctx.GetPlace()); - - loss->mutable_data(ctx.GetPlace()); - - // SoftmaxCrossEntropyWithLogits requires loss to be of shape [batch_size] - auto loss_dims = loss->dims(); - loss->Resize({loss_dims[0]}); - auto runner_s = NpuOpRunner("SoftmaxCrossEntropyWithLogits", - {*logits, cast_onehot}, {*loss, backprop}, {}); + // SparseSoftmaxCrossEntropyWithLogits + const auto& runner_s = + NpuOpRunner("SparseSoftmaxCrossEntropyWithLogits", + {logits_2d, labels_1d}, {loss_1d, backprop_2d}, {}); runner_s.Run(stream); - loss->Resize(loss_dims); } }; @@ -113,70 +86,32 @@ template class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* labels = ctx.Input("Label"); - auto* softmax = ctx.Input("Softmax"); + auto* backprop = ctx.Input("Backprop"); auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); - int cls_num = softmax->dims()[1]; + PADDLE_ENFORCE_NOT_NULL(backprop, + platform::errors::PreconditionNotMet( + "backprop should not be null in NPU kernel of " + "softmax_with_cross_entropy_grad.")); + logits_grad->mutable_data(ctx.GetPlace()); + + const int rank = logits_grad->dims().size(); + const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int n = SizeToAxis(axis, logits_grad->dims()); + const int d = SizeFromAxis(axis, logits_grad->dims()); + + Tensor logits_grad_2d, loss_grad_1d, backprop_2d; + + logits_grad_2d.ShareDataWith(*logits_grad).Resize({n, d}); + loss_grad_1d.ShareDataWith(*loss_grad).Resize({n}); + backprop_2d.ShareDataWith(*backprop).Resize({n, d}); auto stream = ctx.template device_context() .stream(); - - // cast label from int64/int32 to int32 - Tensor tmp_labels(framework::proto::VarType::INT32); - if (labels->type() != framework::proto::VarType::INT32) { - tmp_labels.Resize(labels->dims()); - tmp_labels.mutable_data(ctx.GetPlace(), framework::proto::VarType::INT32); - auto dst_dtype = ConvertToNpuDtype(framework::proto::VarType::INT32); - auto runner_cast_label = - NpuOpRunner("Cast", {*labels}, {tmp_labels}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_label.Run(stream); - labels = &tmp_labels; - } - - // on and off - Tensor on_tensor(framework::proto::VarType::INT32); - on_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&on_tensor, static_cast(1)); - Tensor off_tensor(framework::proto::VarType::INT32); - off_tensor.mutable_data({1}, ctx.GetPlace()); - FillNpuTensorWithConstant(&off_tensor, static_cast(0)); - - // one_hot - 
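For readers following the NPU rewrite above: logits and labels are flattened to [n, d] and [n] before calling SparseSoftmaxCrossEntropyWithLogits. A sketch of the assumed semantics of the SizeToAxis / SizeFromAxis helpers used for that flattening (free functions written here for illustration only; the real definitions live elsewhere in Paddle):

#include <cstdint>
#include <vector>

// For dims = [d0, d1, ..., d(k-1)] and a canonicalized axis a:
//   SizeToAxis(a, dims)   == d0 * ... * d(a-1)    -> the "n" rows
//   SizeFromAxis(a, dims) == da * ... * d(k-1)    -> the "d" columns
int64_t SizeToAxis(int axis, const std::vector<int64_t>& dims) {
  int64_t size = 1;
  for (int i = 0; i < axis; ++i) size *= dims[i];
  return size;
}

int64_t SizeFromAxis(int axis, const std::vector<int64_t>& dims) {
  int64_t size = 1;
  for (int i = axis; i < static_cast<int>(dims.size()); ++i) size *= dims[i];
  return size;
}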
Tensor tmp_onehot(on_tensor.type()); - tmp_onehot.Resize(softmax->dims()); - tmp_onehot.mutable_data(ctx.GetPlace()); - - auto runner_onehot = - NpuOpRunner("OneHotD", {*labels, on_tensor, off_tensor}, {tmp_onehot}, - {{"axis", -1}, {"depth", cls_num}}); - runner_onehot.Run(stream); - - // cast one_hot from int32 to T - Tensor cast_onehot(softmax->type()); - cast_onehot.Resize(tmp_onehot.dims()); - cast_onehot.mutable_data(ctx.GetPlace()); - auto dst_dtype = ConvertToNpuDtype(softmax->type()); - auto runner_cast_onehot = - NpuOpRunner("Cast", {tmp_onehot}, {cast_onehot}, - {{"dst_type", static_cast(dst_dtype)}}); - runner_cast_onehot.Run(stream); - - // sub - Tensor tmp_sub(softmax->type()); - tmp_sub.Resize(softmax->dims()); - tmp_sub.mutable_data(ctx.GetPlace()); - auto runner_sub = - NpuOpRunner("Sub", {*softmax, cast_onehot}, {tmp_sub}, {}); - - runner_sub.Run(stream); - // mul - logits_grad->mutable_data(ctx.GetPlace()); - auto runner_mul = - NpuOpRunner("Mul", {*loss_grad, tmp_sub}, {*logits_grad}, {}); + const auto& runner_mul = + NpuOpRunner("Mul", {*loss_grad, *backprop}, {*logits_grad}, {}); runner_mul.Run(stream); } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc index 8635def2ecf138550bf02f0013b31b59647777b9..a79e31eb8d028d3d319176e397ba5da9da54cd0e 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc @@ -54,8 +54,9 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { int len = logits->numel(); T* clip_logits_data = clip_logits.mutable_data(context.GetPlace(), len * sizeof(T)); - r = xpu::clip(dev_ctx.x_context(), logits->data(), clip_logits_data, - len, -1e30, 1e30); + r = xpu::clip_v2(dev_ctx.x_context(), logits->data(), + clip_logits_data, len, static_cast(-1e20), + static_cast(1e20)); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, platform::errors::External("XPU kernel error. 
clip " diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index 0151778075de04c773cb4b7443d0aa2f28fdeadc..f81ac8882d1076a0999acc0810a0a387028d6c7c 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -73,8 +73,26 @@ class SplitOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // OneDNN uses blocking format, which cannot be always + // supported with reorders, because if blocked dimension is not divisible + // by + // 8 or 16(depending on which blocking format is used) submemory cannot be + // created, so in that scenario a fallback is needed + auto tmp_md = dnnl::memory::desc( + framework::vectorize(ctx.Input("X")->dims()), + dnnl::memory::data_type::f32, ctx.Input("X")->format()); + if (tmp_md.data.format_desc.blocking.inner_nblks == 0) + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( @@ -136,6 +154,14 @@ Example: "(int, default 0) " "The axis which the input will be split on.") .SetDefault(0); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "bfloat16"}); } }; diff --git a/paddle/fluid/operators/squeeze_op_npu.cc b/paddle/fluid/operators/squeeze_op_npu.cc index 33c9273e3b6f50038a738744d47db1ae246d25f8..d72827d28099afaff43eea474e69327c1c62cf24 100644 --- a/paddle/fluid/operators/squeeze_op_npu.cc +++ b/paddle/fluid/operators/squeeze_op_npu.cc @@ -12,11 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_ASCEND_CL -#include -#include - -#include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/squeeze_op.h" namespace ops = paddle::operators; @@ -40,4 +35,21 @@ REGISTER_OP_NPU_KERNEL( ops::SqueezeKernel, ops::SqueezeKernel, ops::SqueezeKernel); -#endif +REGISTER_OP_NPU_KERNEL( + squeeze_grad, ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel); +REGISTER_OP_NPU_KERNEL( + squeeze2_grad, ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel); diff --git a/paddle/fluid/operators/stack_op.cu b/paddle/fluid/operators/stack_op.cu index 4800f5f9eb533c047ef53755b88bf2d2f288e99c..9e5e45f4d22d919e9fd037b7d32e1408a5e092dc 100644 --- a/paddle/fluid/operators/stack_op.cu +++ b/paddle/fluid/operators/stack_op.cu @@ -96,9 +96,10 @@ class StackGPUKernel : public framework::OpKernel { }; template -__global__ void UnStackCUDAKernel(const T* __restrict__ input, int pre_dim_size, - int split_dim_size, int suf_dim_size, - int num_split, T** output_ptrs) { +__global__ void UnStackHelperCUDAKernel(const T* __restrict__ input, + int pre_dim_size, int split_dim_size, + int suf_dim_size, int num_split, + T** output_ptrs) { assert(blockDim.y == 1); assert(blockDim.z == 1); // In this case they are equal @@ -114,6 +115,9 @@ __global__ void UnStackCUDAKernel(const T* __restrict__ input, int pre_dim_size, IntType k = offset % suf_dim_size; T* output = output_ptrs[j / each_dim_size]; + if (output == nullptr) { + return; + } IntType output_ind = i * each_dim_size * suf_dim_size + (j % each_dim_size) * suf_dim_size + k; *(output + output_ind) = input[offset]; @@ -142,6 +146,9 @@ class StackGradGPUKernel : public framework::OpKernel { std::vector outputs(n); auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); for (size_t j = 0; j < dx.size(); ++j) { + if (dx[j] == nullptr) { + outputs[j] = nullptr; + } if (out_var_names[j] != framework::kEmptyVarName && dx[j]->numel() != 0UL) { T* ptr = dx[j]->mutable_data(ctx.GetPlace()); @@ -170,13 +177,13 @@ class StackGradGPUKernel : public framework::OpKernel { auto config = GetGpuLaunchConfig1D(dev_ctx, dy_pre * split_dim * dy_suf); if (dy->numel() < std::numeric_limits::max()) { - UnStackCUDAKernel< + UnStackHelperCUDAKernel< T, int32_t><<>>( dy_data, dy_pre, split_dim, dy_suf, split_dim, reinterpret_cast(tmp_out_data->ptr())); } else { - UnStackCUDAKernel< + UnStackHelperCUDAKernel< T, int64_t><<>>( dy_data, dy_pre, split_dim, dy_suf, split_dim, diff --git a/paddle/fluid/operators/stack_op_npu.cc b/paddle/fluid/operators/stack_op_npu.cc index 958655b1f27c680655c20e8f795fc9e4bf37251d..3b685b3ab8dbb0166d50ec521b9b93c4508dab12 100644 --- a/paddle/fluid/operators/stack_op_npu.cc +++ b/paddle/fluid/operators/stack_op_npu.cc @@ -12,15 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_ASCEND_CL -#include -#include -#include - -#include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/stack_op.h" -#include "paddle/fluid/operators/unsqueeze_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { @@ -32,64 +25,56 @@ class StackNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto x = ctx.MultiInput("X"); - int32_t N = x.size(); + auto* y = ctx.Output("Y"); + int axis = ctx.Attr("axis"); + if (axis < 0) axis += (x[0]->dims().size() + 1); + int num = static_cast(x.size()); - PADDLE_ENFORCE_GT( - N, 0, platform::errors::InvalidArgument("number of input Tensor <= 0")); + PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( + "number of input Tensor <= 0")); + + auto stream = + ctx.template device_context() + .stream(); std::vector x_list; - for (int i = 0; i < N; i++) { + for (int i = 0; i < num; i++) { x_list.push_back(*x[i]); } + y->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); + const auto& runner = + NpuOpRunner("Pack", {x_list}, {*y}, {{"axis", axis}, {"N", num}}); + runner.Run(stream); + } +}; - if (axis < 0) { - axis = axis + x_list[0].dims().size() + 1; - } - auto* out = ctx.Output("Y"); +template +class StackGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto dx = ctx.MultiOutput(framework::GradVarName("X")); + int axis = ctx.Attr("axis"); + if (axis < 0) axis += dy->dims().size(); + int num = dy->dims()[axis]; - auto place = ctx.GetPlace(); + PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument( + "number of input Tensor <= 0")); auto stream = ctx.template device_context() .stream(); - out->mutable_data(place); - - if (axis != 0) { - auto x_dim = x_list[0].dims(); - std::vector vec_dim_tmp; - vec_dim_tmp.push_back(N); - for (auto i = 0; i < x_dim.size(); ++i) { - vec_dim_tmp.push_back(x_dim[i]); - } - - Tensor tmp_stack(out->type()); - tmp_stack.Resize(framework::make_ddim(vec_dim_tmp)); - tmp_stack.mutable_data(ctx.GetPlace()); - - auto runner = - NpuOpRunner("Pack", {x_list}, {tmp_stack}, {{"axis", 0}, {"N", N}}); - runner.Run(stream); - - std::vector vec_trans; - for (auto i = 1; i <= x_dim.size(); ++i) { - vec_trans.push_back(i); - if (i == axis) { - vec_trans.push_back(0); - } - } - - auto runner_trans_final = - NpuOpRunner("TransposeD", {tmp_stack}, {*out}, {{"perm", vec_trans}}); - runner_trans_final.Run(stream); - - } else { - auto runner = - NpuOpRunner("Pack", {x_list}, {*out}, {{"axis", axis}, {"N", N}}); - runner.Run(stream); + std::vector dx_list; + for (int i = 0; i < num; i++) { + dx[i]->mutable_data(ctx.GetPlace()); + dx_list.push_back(*dx[i]); } + + const auto& runner = + NpuOpRunner("Unpack", {*dy}, {dx_list}, {{"axis", axis}, {"num", num}}); + runner.Run(stream); } }; @@ -103,4 +88,8 @@ REGISTER_OP_NPU_KERNEL( ops::StackNPUKernel); -#endif +REGISTER_OP_NPU_KERNEL( + stack_grad, + ops::StackGradNPUKernel, + ops::StackGradNPUKernel); diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc index e49476e4dc7d4a0eb5d4bb996e935b30dafd55d0..f8272d550b99917e0534d0c4223b7d54e6e450b2 100644 --- a/paddle/fluid/operators/strided_slice_op.cc +++ b/paddle/fluid/operators/strided_slice_op.cc @@ -324,22 +324,24 @@ REGISTER_OPERATOR(strided_slice_grad, 
ops::StridedSliceOpGrad, REGISTER_OP_CPU_KERNEL( strided_slice, + ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, + paddle::platform::complex>, ops::StridedSliceKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( strided_slice_grad, + ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, + paddle::platform::complex>, ops::StridedSliceGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/strided_slice_op.cu b/paddle/fluid/operators/strided_slice_op.cu index b85403b1c5bb886a1a08f084e899c7f27ab5e963..f88605fbfc86dc30b16b4c0115eff2f6e9bbdc3b 100644 --- a/paddle/fluid/operators/strided_slice_op.cu +++ b/paddle/fluid/operators/strided_slice_op.cu @@ -13,28 +13,29 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/strided_slice_op.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( strided_slice, + ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, ops::StridedSliceKernel, + paddle::platform::complex>, ops::StridedSliceKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( strided_slice_grad, - ops::StridedSliceGradKernel, + ops::StridedSliceGradKernel, + ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, ops::StridedSliceGradKernel, + paddle::platform::complex>, ops::StridedSliceGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 741f86f35848b2e626923e381bf007f351584789..0f520adba57a203fae5d3b34fb67067d01691bed 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -326,4 +326,6 @@ REGISTER_OP_CPU_KERNEL( sum, ops::SumKernel, ops::SumKernel, ops::SumKernel, + ops::SumKernel, ops::SumKernel); diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc index e3dc5faf46c81e71173c6f5a6ad7766067cad1c3..a6032236c01ac3042f1c1605674adac3bfaa36e2 100644 --- a/paddle/fluid/operators/sum_op_npu.cc +++ b/paddle/fluid/operators/sum_op_npu.cc @@ -35,21 +35,28 @@ class SumNPUKernel : public framework::OpKernel { auto place = ctx.GetPlace(); int n = static_cast(x.size()); - PADDLE_ENFORCE_EQ(n > 1, true, - platform::errors::InvalidArgument( - "The size of Input(x) list must larger or equal 2")); + if (n == 1) { + TensorCopy(*x[0], place, out); + return; + } + + std::vector inputs; + std::vector names; + for (int i = 0; i < n; ++i) { + if (x[i] && x[i]->numel() > 0) { + inputs.push_back(*x[i]); + names.push_back("x" + std::to_string(i)); + } else { + continue; + } + } auto stream = ctx.template device_context() .stream(); - - auto runner = NpuOpRunner("Add", {*x[0], *x[1]}, {*out}, {}); - + NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}}; + runner.AddInputNames(names); runner.Run(stream); - for (int i = 2; i < n; i++) { - runner = NpuOpRunner("Add", {*out, *x[i]}, {*out}, {}); - runner.Run(stream); - } } }; diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc index 
620231eb2e298480665cf4eec316f034e0cf1d1c..eb20e1c2cd2748a5ab4db28df0c4798837c7bf21 100644 --- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -250,8 +250,12 @@ class LoDTensorArray2TensorGradOp : public framework::OperatorBase { auto dout_name = Input(framework::GradVarName("Out")); std::vector grad_names; + // NOTE(Aurelius84): Generating grad base name by Input("X") instead of + // fixed string to avoid incorrectly sharing same var's allocation in + // multi-thread that will cause wrong calculation result. + std::string grad_base_name = base_name + "_temp_grad_"; - LodTensorVectorResizeFromLodTensorArray(scope, "grad_name", Input("X"), + LodTensorVectorResizeFromLodTensorArray(scope, grad_base_name, Input("X"), &grad_names); auto use_stack = Attr("use_stack"); diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index ca8f6ce84fc571674fdfe6f29cbcd82a98fd8fcf..60eeb66ae7d1eca6e093432bfdc4e5f12f47f2e9 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -48,7 +48,7 @@ class DygraphInferShapeTest { void SetOpType(const std::string& op_type) { op_type_ = op_type; } void Run(std::function infer_shape) { imperative::DygraphInferShapeContext ctx( - &ins_, &outs_, &attrs_, op_type_); + &ins_, &outs_, &attrs_, {}, op_type_); infer_shape(&ctx); for (const auto& pair : expected_dims_) { auto out = outs_[pair.first][0]; diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index a7d7ea260ecdf44ab94e65f28db1294f7c57c527..07749f90ebaa29c3f618a5850ad2d72942035e95 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -22,6 +22,7 @@ limitations under the License. 
*/ #ifdef __HIPCC__ #include #endif +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" @@ -563,15 +564,19 @@ bool SortTopk(const platform::CUDADeviceContext& ctx, const Eigen::DSizes slice_sizes{num_rows, k}; auto e_indices = framework::EigenMatrix::From(*indices_tensor, dim); - auto e_tmp_indices = framework::EigenMatrix::From(temp_indices); + auto e_tmp_indices = framework::EigenMatrix::From( + static_cast(temp_indices)); std::vector odims = {static_cast(num_rows), static_cast(k)}; auto dim = framework::make_ddim(odims); auto e_values = framework::EigenMatrix::From(*out_tensor, dim); - auto e_tmp_values = framework::EigenMatrix::From(temp_values); + auto e_tmp_values = + framework::EigenMatrix::From(static_cast(temp_values)); - e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes); - e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes); + EigenSlice, int64_t, 2>::Eval( + dev, e_indices, e_tmp_indices, slice_indices, slice_sizes); + EigenSlice, T, 2>::Eval( + dev, e_values, e_tmp_values, slice_indices, slice_sizes); } return true; } diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc index 684bd476b6ef21bf58a990c36b1ee6f820d82caf..ca3a5f957685d98bfdc3a008ab71d5806814b1eb 100644 --- a/paddle/fluid/operators/top_k_op_npu.cc +++ b/paddle/fluid/operators/top_k_op_npu.cc @@ -48,7 +48,7 @@ class TopkNPUKernel : public framework::OpKernel { size_t k = static_cast(ctx.Attr("k")); output->mutable_data(ctx.GetPlace()); - indices->mutable_data(ctx.GetPlace()); + indices->mutable_data(ctx.GetPlace()); // prepare assit auto dim = input->dims().size(); @@ -62,15 +62,24 @@ class TopkNPUKernel : public framework::OpKernel { {"dim", -1}, {"largest", true}}; - // run ascend - auto runner = NpuOpRunner("TopKD", {*input, assist_seq_tensor}, - {*output, *indices}, attr_input); + Tensor tmp_indices(framework::proto::VarType::INT32); + tmp_indices.Resize(indices->dims()); + tmp_indices.mutable_data(ctx.GetPlace()); + // run ascend + const auto& runner = NpuOpRunner("TopKD", {*input, assist_seq_tensor}, + {*output, tmp_indices}, attr_input); auto stream = ctx.template device_context() .stream(); - runner.Run(stream); + + // cast indices from INT32 to INT64 + auto dst_dtype = ConvertToNpuDtype(indices->type()); + const auto& runner_cast_indices = + NpuOpRunner("Cast", {tmp_indices}, {*indices}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast_indices.Run(stream); } }; diff --git a/paddle/fluid/operators/trace_op.cc b/paddle/fluid/operators/trace_op.cc index 623d4c7fc23ba2477d720c46697760efb1dd1429..de71a089b692a9f2ea4c3c59c1fa85cbc47b1e33 100644 --- a/paddle/fluid/operators/trace_op.cc +++ b/paddle/fluid/operators/trace_op.cc @@ -167,18 +167,18 @@ REGISTER_OP_CPU_KERNEL( ops::TraceKernel, ops::TraceKernel, ops::TraceKernel, + paddle::platform::complex>, ops::TraceKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( trace_grad, ops::TraceGradKernel, ops::TraceGradKernel, ops::TraceGradKernel, ops::TraceGradKernel, ops::TraceGradKernel, + paddle::platform::complex>, ops::TraceGradKernel); + paddle::platform::complex>); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(trace) diff --git a/paddle/fluid/operators/trace_op.cu b/paddle/fluid/operators/trace_op.cu index 
2c2745018be40255cd35585b06303506cf4dd386..f3fe32e10a52b6fcc8bbae9f8f1b9ab4a104d8b2 100644 --- a/paddle/fluid/operators/trace_op.cu +++ b/paddle/fluid/operators/trace_op.cu @@ -14,17 +14,20 @@ #include #include +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/reduce_ops/cub_reduce.h" #include "paddle/fluid/operators/trace_op.h" namespace paddle { namespace operators { -template struct IdentityFunctor { HOSTDEVICE explicit inline IdentityFunctor() {} - HOSTDEVICE inline T operator()(const T& x) const { return x; } + template + HOSTDEVICE inline U operator()(const U& x) const { + return x; + } }; template @@ -45,9 +48,12 @@ class TraceCUDAKernel : public framework::OpKernel { auto stream = context.cuda_device_context().stream(); std::vector reduce_dims; reduce_dims.push_back(out->dims().size()); - TensorReduce>( + TensorReduce( diag, out, reduce_dims, static_cast(0), cub::Sum(), - IdentityFunctor(), stream); + IdentityFunctor(), stream); + } else { + math::SetConstant functor; + functor(context.device_context(), out, static_cast(0)); } } }; @@ -64,9 +70,9 @@ REGISTER_OP_CUDA_KERNEL( ops::TraceCUDAKernel, ops::TraceCUDAKernel, ops::TraceCUDAKernel, + paddle::platform::complex>, ops::TraceCUDAKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( trace_grad, ops::TraceGradKernel, ops::TraceGradKernel, @@ -75,6 +81,6 @@ REGISTER_OP_CUDA_KERNEL( ops::TraceGradKernel, ops::TraceGradKernel, ops::TraceGradKernel, + paddle::platform::complex>, ops::TraceGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/trace_op.h b/paddle/fluid/operators/trace_op.h index b7a6e559ed4ef6ee4cd43b9375b3531488db449d..ca9439cbed97ddb02e2e6eaa2fb89628e738576e 100644 --- a/paddle/fluid/operators/trace_op.h +++ b/paddle/fluid/operators/trace_op.h @@ -179,7 +179,7 @@ class TraceKernel : public framework::OpKernel { auto output_dims = out->dims(); - out->mutable_data(context.GetPlace()); + T* out_data = out->mutable_data(context.GetPlace()); const framework::Tensor diag = Diagonal(context, input, offset, dim1, dim2); @@ -191,6 +191,8 @@ class TraceKernel : public framework::OpKernel { auto reduce_dim = Eigen::array({1}); output.device(place) = x.sum(reduce_dim); out->Resize(output_dims); + } else { + std::fill(out_data, out_data + out->numel(), static_cast(0)); } } }; diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 465970451f5d105e6a33555ed241c4528e35d50a..95b2c13ff6c631c05ab3abd2cf582ad3603dc031 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -341,17 +341,17 @@ REGISTER_OP_CPU_KERNEL( transpose, ops::TransposeKernel, ops::TransposeKernel, ops::TransposeKernel, + paddle::platform::complex>, ops::TransposeKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( transpose_grad, ops::TransposeGradKernel, ops::TransposeGradKernel, ops::TransposeGradKernel, + paddle::platform::complex>, ops::TransposeGradKernel); + paddle::platform::complex>); REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker, ops::Transpose2GradMaker, @@ -366,9 +366,9 @@ REGISTER_OP_CPU_KERNEL( ops::TransposeKernel, ops::TransposeKernel, ops::TransposeKernel, + paddle::platform::complex>, ops::TransposeKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( transpose2_grad, ops::TransposeGradKernel, @@ -376,6 +376,6 @@ REGISTER_OP_CPU_KERNEL( ops::TransposeGradKernel, ops::TransposeGradKernel, ops::TransposeGradKernel, + paddle::platform::complex>, 
ops::TransposeGradKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/transpose_op.cu b/paddle/fluid/operators/transpose_op.cu index afeb22bd6fa2d4e1c4d222b01d65bff8bf05a74b..a462bbb4834acc502e57e189afb23137b09b73a0 100644 --- a/paddle/fluid/operators/transpose_op.cu +++ b/paddle/fluid/operators/transpose_op.cu @@ -732,9 +732,9 @@ REGISTER_OP_CUDA_KERNEL( ops::TransposeGPUKernel, ops::TransposeGPUKernel, ops::TransposeGPUKernel, + paddle::platform::complex>, ops::TransposeGPUKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( transpose_grad, ops::TransposeGradGPUKernel, @@ -742,9 +742,9 @@ REGISTER_OP_CUDA_KERNEL( ops::TransposeGradGPUKernel, ops::TransposeGradGPUKernel, + paddle::platform::complex>, ops::TransposeGradGPUKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( transpose2, @@ -754,9 +754,9 @@ REGISTER_OP_CUDA_KERNEL( ops::TransposeGPUKernel, ops::TransposeGPUKernel, ops::TransposeGPUKernel, + paddle::platform::complex>, ops::TransposeGPUKernel); + paddle::platform::complex>); REGISTER_OP_CUDA_KERNEL( transpose2_grad, ops::TransposeGradGPUKernel, @@ -766,6 +766,6 @@ REGISTER_OP_CUDA_KERNEL( ops::TransposeGradGPUKernel, ops::TransposeGradGPUKernel, + paddle::platform::complex>, ops::TransposeGradGPUKernel); + paddle::platform::complex>); diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc index 994b8e534f85e2926481d3767f6e75892751d959..035ad5f3f314aaa00f6f717e564c1933f3b7c562 100644 --- a/paddle/fluid/operators/transpose_op_npu.cc +++ b/paddle/fluid/operators/transpose_op_npu.cc @@ -29,7 +29,7 @@ class TransposeNPUKernel : public framework::OpKernel { std::vector axis = ctx.Attr>("axis"); framework::NPUAttributeMap attr_input = {{"perm", axis}}; out->mutable_data(ctx.device_context().GetPlace()); - auto runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input); + const auto& runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input); auto stream = ctx.template device_context() .stream(); @@ -52,7 +52,8 @@ class TransposeGradNPUKernel : public framework::OpKernel { } x_grad->mutable_data(ctx.GetPlace()); framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}}; - auto runner = NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input); + const auto& runner = + NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input); auto stream = ctx.template device_context() .stream(); diff --git a/paddle/fluid/operators/tril_triu_op.cc b/paddle/fluid/operators/tril_triu_op.cc index 8fb0b3809503ecc86e33796a4bc7f7cb2d21f8bb..3e943c62e1ce17857e78e140efeb50e627e80a4e 100644 --- a/paddle/fluid/operators/tril_triu_op.cc +++ b/paddle/fluid/operators/tril_triu_op.cc @@ -105,13 +105,15 @@ REGISTER_OPERATOR(tril_triu, ops::TrilTriuOp, ops::TrilTriuOpMaker, ops::TrilTriuGradOpMaker); REGISTER_OPERATOR(tril_triu_grad, ops::TrilTriuGradOp); REGISTER_OP_CPU_KERNEL( - tril_triu, ops::TrilTriuOpKernel, + tril_triu, ops::TrilTriuOpKernel, + ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel); REGISTER_OP_CPU_KERNEL( tril_triu_grad, + ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, diff --git a/paddle/fluid/operators/tril_triu_op.cu b/paddle/fluid/operators/tril_triu_op.cu index d04acd340597928ba0fbbbebf2dfc7eda1d698ac..9cbbdeeb2ce28453f2c22d063975fa82aae5d3b3 100644 --- a/paddle/fluid/operators/tril_triu_op.cu +++ b/paddle/fluid/operators/tril_triu_op.cu @@ -18,7 +18,7 @@ 
namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( - tril_triu, + tril_triu, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, ops::TrilTriuOpKernel, @@ -26,6 +26,7 @@ REGISTER_OP_CUDA_KERNEL( ops::TrilTriuOpKernel); REGISTER_OP_CUDA_KERNEL( tril_triu_grad, + ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, ops::TrilTriuGradOpKernel, diff --git a/paddle/fluid/operators/trunc_op.cc b/paddle/fluid/operators/trunc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..2b79e2152b2f3414c3e3b7794e8c07c00a2aee00 --- /dev/null +++ b/paddle/fluid/operators/trunc_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/trunc_op.h" + +namespace paddle { +namespace operators { + +class TruncOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "trunc"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "trunc"); + auto input_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", input_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class TruncOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of trunc op."); + AddOutput("Out", "(Tensor), The output tensor of trunc op."); + AddComment(R"DOC( +Trunc Operator. +Returns a new tensor with the truncated integer values of input. 
+$$out = trunc(x)$$ +)DOC"); + } +}; + +class TruncGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), "TruncGrad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "TruncGrad"); + + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), dout_dims); + } +}; + +template +class TruncGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("trunc_grad"); + retv->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(trunc, ops::TruncOp, ops::TruncOpMaker, + ops::TruncGradOpMaker, + ops::TruncGradOpMaker); + +REGISTER_OPERATOR(trunc_grad, ops::TruncGradOp); + +REGISTER_OP_CPU_KERNEL(trunc, ops::TruncKernel, ops::TruncKernel, + ops::TruncKernel, ops::TruncKernel); + +REGISTER_OP_CPU_KERNEL(trunc_grad, ops::TruncGradKernel, + ops::TruncGradKernel, ops::TruncGradKernel, + ops::TruncGradKernel); diff --git a/paddle/fluid/operators/trunc_op.cu b/paddle/fluid/operators/trunc_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a284e0ea6e393910c35f11a64039e6b58f2f67a2 --- /dev/null +++ b/paddle/fluid/operators/trunc_op.cu @@ -0,0 +1,115 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/trunc_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using platform::PADDLE_CUDA_NUM_THREADS; + +template +class TruncFunctor { + public: + __device__ TruncFunctor(const T x) : x_(x) {} + __device__ T operator()() { return trunc(x_); } + + public: + const T x_; +}; + +template <> +class TruncFunctor { + public: + __device__ TruncFunctor(const int x) : x_(x) {} + __device__ int operator()() { return x_; } + + public: + const int x_; +}; + +template <> +class TruncFunctor { + public: + __device__ TruncFunctor(const int64_t x) : x_(x) {} + __device__ int64_t operator()() { return x_; } + + public: + const int64_t x_; +}; + +template +__global__ void Trunc(const T* x, T* out, int64_t N) { + CUDA_KERNEL_LOOP(index, N) { + TruncFunctor functor(x[index]); + out[index] = functor(); + } +} + +template +__global__ void TruncGrad(T* dx, int64_t N) { + CUDA_KERNEL_LOOP(index, N) { dx[index] = static_cast(0.0); } +} + +template +class TruncCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + + const auto* x_data = x->data(); + auto* out_data = out->mutable_data(context.GetPlace()); + + int64_t numel = x->numel(); + + int theads = PADDLE_CUDA_NUM_THREADS; + int blocks = (numel + theads - 1) / theads; + + Trunc<<>>(x_data, out_data, numel); + } +}; + +template +class TruncCUDAGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* dout = context.Input(framework::GradVarName("Out")); + auto* dx = context.Output(framework::GradVarName("X")); + + const auto* dout_data = dout->data(); + auto* dx_data = dx->mutable_data(context.GetPlace()); + + int64_t numel = dout->numel(); + + int theads = PADDLE_CUDA_NUM_THREADS; + int blocks = (numel + theads - 1) / theads; + + TruncGrad<<>>(dx_data, numel); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(trunc, ops::TruncCUDAKernel, + ops::TruncCUDAKernel, ops::TruncCUDAKernel, + ops::TruncCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(trunc_grad, ops::TruncCUDAGradKernel, + ops::TruncCUDAGradKernel, + ops::TruncCUDAGradKernel, + ops::TruncCUDAGradKernel); diff --git a/paddle/fluid/operators/trunc_op.h b/paddle/fluid/operators/trunc_op.h new file mode 100644 index 0000000000000000000000000000000000000000..0f788eae5249c57b92c7558451eca641a6840a41 --- /dev/null +++ b/paddle/fluid/operators/trunc_op.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
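A usage-level restatement of the new trunc operator above (plain C++, illustration only, not the Paddle kernels): the forward pass rounds every element toward zero, and because the result is piecewise constant the backward pass simply zero-fills the input gradient, which is exactly what the TruncGrad kernels do.

#include <cmath>
#include <cstdio>

int main() {
  const double x[4] = {2.7, -2.7, 0.3, -0.3};
  for (double v : x) {
    // forward: round toward zero; backward: the derivative is 0 almost everywhere
    std::printf("trunc(%+.1f) = %+.1f, d trunc / dx = 0\n", v, std::trunc(v));
  }
  return 0;
}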
*/ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class TruncKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* x = context.Input("X"); + Tensor* out = context.Output("Out"); + + size_t numel = x->numel(); + const T* x_data = x->data(); + T* out_data = out->mutable_data(context.GetPlace()); + + for (size_t i = 0; i < numel; i++) { + out_data[i] = trunc(x_data[i]); + } + } +}; + +template +class TruncGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* dx = context.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(context.GetPlace()); + + int numel = dx->numel(); + memset(dx_data, 0.0, numel * sizeof(T)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 7f3190d9112c66a09b1a5c7432a06b6e4a4ead6f..1cc46e7265f63992092ab260e8cbf3f756e05db6 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -59,7 +59,7 @@ class TruncatedGaussianRandomNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context() .stream(); - auto runner = NpuOpRunner( + const auto& runner = NpuOpRunner( "ParameterizedTruncatedNormal", {shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor}, {*out}, {{"seed", seed_var}}); diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 6efada4343ca54c0d56f98cae20963bf0182f47b..007276b16d7f2e4d184094f97a20f138b14faa37 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -18,10 +18,41 @@ limitations under the License. */ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/bfloat16.h" namespace paddle { namespace operators { +namespace { +template +inline void UniformRealDistribution(T *data, const int64_t &size, + const float &min, const float &max, + const unsigned int &seed) { + VLOG(4) << "[CPU] UniformRandomKernel"; + std::uniform_real_distribution dist(static_cast(min), + static_cast(max)); + auto engine = paddle::framework::GetCPURandomEngine(seed); + + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); + } +} + +template <> +inline void UniformRealDistribution(paddle::platform::bfloat16 *data, + const int64_t &size, const float &min, + const float &max, + const unsigned int &seed) { + VLOG(4) << "[CPU] UniformRandomKernel"; + std::uniform_real_distribution dist(min, max); + auto engine = paddle::framework::GetCPURandomEngine(seed); + + for (int64_t i = 0; i < size; ++i) { + data[i] = static_cast(dist(*engine)); + } +} +} // namespace + // It seems that Eigen::Tensor::random in GPU will SEGFAULT. // Use std::random and thrust::random(thrust is a std library in CUDA) to // implement uniform random. 
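The CPU uniform_random refactor above moves sampling into a UniformRealDistribution<T> template with a bfloat16 specialization. The reason for the specialization is that std::uniform_real_distribution is only specified for float, double and long double, so a 16-bit type has to be drawn in float and then narrowed. A minimal standalone sketch of the same pattern (FillUniform is hypothetical and std::mt19937_64 stands in for Paddle's GetCPURandomEngine):

#include <cstdint>
#include <random>

template <typename T>
void FillUniform(T* data, int64_t size, float min, float max, unsigned seed) {
  std::mt19937_64 engine(seed);
  std::uniform_real_distribution<float> dist(min, max);
  for (int64_t i = 0; i < size; ++i) {
    // T may be float, double or a narrow type such as bfloat16
    data[i] = static_cast<T>(dist(engine));
  }
}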
@@ -61,17 +92,11 @@ class CPUUniformRandomKernel : public framework::OpKernel { framework::ToTypeName(out_var->Type()))); } T *data = tensor->mutable_data(ctx.GetPlace()); - int64_t size = tensor->numel(); - std::uniform_real_distribution dist( - static_cast(ctx.Attr("min")), - static_cast(ctx.Attr("max"))); - unsigned int seed = static_cast(ctx.Attr("seed")); - auto engine = framework::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } + UniformRealDistribution( + data, size, ctx.Attr("min"), ctx.Attr("max"), + static_cast(ctx.Attr("seed"))); unsigned int diag_num = static_cast(ctx.Attr("diag_num")); @@ -257,9 +282,12 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::operators::UniformRandomOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(uniform_random, - paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel); -REGISTER_OP_CPU_KERNEL(uniform_random_batch_size_like, - paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel); +REGISTER_OP_CPU_KERNEL( + uniform_random, paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel); +REGISTER_OP_CPU_KERNEL( + uniform_random_batch_size_like, + paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel); diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 6052e533643f3c4e5be977a87fceafa932892862..18a4154be30ac7c4c141fe1e4dc8f43a4b42aac7 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -24,9 +24,9 @@ namespace operators { using Tensor = framework::Tensor; inline std::vector GetNewDataFromShapeTensor( - const Tensor *new_data_tensor) { + const Tensor* new_data_tensor) { if (new_data_tensor->type() == framework::proto::VarType::INT64) { - auto *new_data = new_data_tensor->data(); + auto* new_data = new_data_tensor->data(); framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { TensorCopySync(*new_data_tensor, platform::CPUPlace(), @@ -37,7 +37,7 @@ inline std::vector GetNewDataFromShapeTensor( new_data + new_data_tensor->numel()); return vec_new_data; } else if (new_data_tensor->type() == framework::proto::VarType::INT32) { - auto *new_data = new_data_tensor->data(); + auto* new_data = new_data_tensor->data(); std::vector vec_new_data; framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { @@ -58,7 +58,7 @@ inline std::vector GetNewDataFromShapeTensor( } inline std::vector GetNewDataFromShapeTensorList( - const std::vector &list_new_shape_tensor) { + const std::vector& list_new_shape_tensor) { std::vector vec_new_shape; vec_new_shape.reserve(list_new_shape_tensor.size()); for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { @@ -97,6 +97,5 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/uniform_random_op_npu.cc b/paddle/fluid/operators/uniform_random_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c2f2b07ce897524467ae1877f4a3252571d0106 --- /dev/null +++ b/paddle/fluid/operators/uniform_random_op_npu.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/uniform_random_op.h" + +namespace paddle { +namespace operators { + +template +class NPUUniformRandomKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + framework::Tensor *tensor = nullptr; + auto out_var = ctx.OutputVar("Out"); + std::vector new_shape; + auto list_new_shape_tensor = + ctx.MultiInput("ShapeTensorList"); + if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) { + if (ctx.HasInput("ShapeTensor")) { + auto *shape_tensor = ctx.Input("ShapeTensor"); + new_shape = GetNewDataFromShapeTensor(shape_tensor); + } else if (list_new_shape_tensor.size() > 0) { + new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor); + } + } + + if (out_var->IsType()) { + auto *selected_rows = out_var->GetMutable(); + tensor = selected_rows->mutable_value(); + auto shape = ctx.Attr>("shape"); + if (!new_shape.empty()) shape = new_shape; + tensor->Resize(framework::make_ddim(shape)); + selected_rows->mutable_rows()->reserve(shape[0]); + } else if (out_var->IsType()) { + tensor = out_var->GetMutable(); + if (!new_shape.empty()) tensor->Resize(framework::make_ddim(new_shape)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Expected type of Output(out) in uniform_random_op must be Tensor, " + "SelectedRows. 
But got " + "unsupport type: %s.", + framework::ToTypeName(out_var->Type()))); + } + tensor->mutable_data(ctx.GetPlace()); + int64_t size = tensor->numel(); + + Tensor cpu_tensor(tensor->type()); + cpu_tensor.Resize(tensor->dims()); + T *data_cpu = cpu_tensor.mutable_data(platform::CPUPlace()); + + std::uniform_real_distribution dist( + static_cast(ctx.Attr("min")), + static_cast(ctx.Attr("max"))); + unsigned int seed = static_cast(ctx.Attr("seed")); + auto engine = framework::GetCPURandomEngine(seed); + + for (int64_t i = 0; i < size; ++i) { + data_cpu[i] = dist(*engine); + } + + unsigned int diag_num = + static_cast(ctx.Attr("diag_num")); + unsigned int diag_step = + static_cast(ctx.Attr("diag_step")); + auto diag_val = static_cast(ctx.Attr("diag_val")); + if (diag_num > 0) { + PADDLE_ENFORCE_GT( + size, (diag_num - 1) * (diag_step + 1), + platform::errors::InvalidArgument( + "ShapeInvalid: the diagonal's elements is equal (num-1) " + "* (step-1) with num %d, step %d," + "It should be smaller than %d, but received %d", + diag_num, diag_step, (diag_num - 1) * (diag_step + 1), size)); + for (int64_t i = 0; i < diag_num; ++i) { + int64_t pos = i * diag_step + i; + data_cpu[pos] = diag_val; + } + } + + // copy to NPU + framework::TensorCopy( + cpu_tensor, ctx.GetPlace(), + ctx.template device_context(), tensor); + ctx.template device_context().Wait(); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_NPU_KERNEL(uniform_random, + paddle::operators::NPUUniformRandomKernel); diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index 2bd2a2cbf34c6ccba1e6bfd1892f0f821d0f7c72..99793ecd244cf2594a2b0b7462a492bc3f4a27af 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -405,13 +405,13 @@ class UniqueKernel : public framework::OpKernel { bool return_counts = context.Attr("return_counts"); if (axis_vec.empty()) { - framework::VisitDataTypeSmall( + framework::VisitDataTypeTiny( data_type, UniqueFlattendTensorFunctor( context, *x, out, return_index, return_inverse, return_counts)); } else { int axis = axis_vec[0]; - framework::VisitDataTypeSmall( + framework::VisitDataTypeTiny( data_type, UniqueDimFunctor( context, *x, out, axis, return_index, return_inverse, return_counts)); diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index cd8b31d72e72adba6232b703e9d2513c90e46cdf..8262273b7ca7da47dc47a2e7a02fa1f40b9d4727 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -30,6 +30,7 @@ register_unity_group(cc bmm_op.cc bpr_loss_op.cc cast_op.cc + mkldnn/cast_mkldnn_op.cc cholesky_op.cc chunk_eval_op.cc clip_by_norm_op.cc @@ -234,6 +235,7 @@ register_unity_group(cc save_combine_op.cc save_op.cc scale_op.cc + mkldnn/scale_mkldnn_op.cc scatter_nd_add_op.cc scatter_op.cc seed_op.cc diff --git a/paddle/fluid/operators/unstack_op_npu.cc b/paddle/fluid/operators/unstack_op_npu.cc new file mode 100644 index 0000000000000000000000000000000000000000..eaab4ee999de73370099a38ec41fde81b6afe1d8 --- /dev/null +++ b/paddle/fluid/operators/unstack_op_npu.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/unstack_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class UnStackNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *dy = ctx.Input("X"); + auto dx = ctx.MultiOutput("Y"); + int axis = ctx.Attr("axis"); + if (axis < 0) axis += dy->dims().size(); + int num = dy->dims()[axis]; + + auto stream = + ctx.template device_context() + .stream(); + + std::vector dx_list; + for (int i = 0; i < num; i++) { + dx[i]->mutable_data(ctx.GetPlace()); + dx_list.push_back(*dx[i]); + } + + const auto &runner = + NpuOpRunner("Unpack", {*dy}, {dx_list}, {{"axis", axis}, {"num", num}}); + runner.Run(stream); + } +}; + +template +class UnStackGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto x = ctx.MultiInput(framework::GradVarName("Y")); + auto *y = ctx.Output(framework::GradVarName("X")); + int axis = ctx.Attr("axis"); + if (axis < 0) axis += (x[0]->dims().size() + 1); + int num = static_cast(x.size()); + + auto stream = + ctx.template device_context() + .stream(); + + std::vector x_list; + for (int i = 0; i < num; i++) { + x_list.push_back(*x[i]); + } + y->mutable_data(ctx.GetPlace()); + + const auto &runner = + NpuOpRunner("Pack", {x_list}, {*y}, {{"axis", axis}, {"N", num}}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + unstack, ops::UnStackNPUKernel, + ops::UnStackNPUKernel); + +REGISTER_OP_NPU_KERNEL( + unstack_grad, ops::UnStackGradNPUKernel, + ops::UnStackGradNPUKernel); diff --git a/paddle/fluid/operators/where_index_op.cu b/paddle/fluid/operators/where_index_op.cu index bb968743585f7d3574d477ab54cf657ef2646873..b1cd172923ee6dc421cc09b27163422207ea099c 100644 --- a/paddle/fluid/operators/where_index_op.cu +++ b/paddle/fluid/operators/where_index_op.cu @@ -12,7 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/where_index_op.h" @@ -25,52 +33,124 @@ namespace operators { using CUDADeviceContext = paddle::platform::CUDADeviceContext; template -class CUDAWhereIndexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - auto* out = context.Output("Out"); - - // TODO(zhoukunsheng): Should optimize to ensure GPU is faster than CPU. 
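The where_index rewrite that follows replaces the old host-side gather (copy the condition to CPU, collect true positions into a thrust::host_vector, copy them back) with two device kernels plus cub::DeviceScan::InclusiveSum: the inclusive count of true elements, minus one, is the output row each true element writes to. A CPU sketch of that indexing (TrueFlatIndices is illustrative only):

#include <cstdint>
#include <vector>

std::vector<int64_t> TrueFlatIndices(const std::vector<bool>& cond) {
  std::vector<int64_t> inclusive(cond.size(), 0), out;
  int64_t running = 0;
  for (size_t i = 0; i < cond.size(); ++i) {
    running += cond[i] ? 1 : 0;
    inclusive[i] = running;  // what the inclusive scan produces on the GPU
    if (cond[i]) {
      // output row (inclusive[i] - 1) receives flat index i
      out.push_back(static_cast<int64_t>(i));
    }
  }
  return out;  // cond = F T T F F F T T  ->  {1, 2, 6, 7}
}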
- framework::Tensor cond_cpu; - framework::TensorCopy(*condition, platform::CPUPlace(), &cond_cpu); - - const T* cond_data = cond_cpu.data(); - int64_t numel = cond_cpu.numel(); - auto dims = cond_cpu.dims(); - int rank = dims.size(); - - thrust::host_vector h_true_index; - for (int64_t i = 0; i < numel; i++) { - if (static_cast(cond_data[i])) { - h_true_index.push_back(i); +__global__ void GetTrueNum(const T *cond_data, const int64_t numel, + int64_t *true_num_array) { + const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) { + true_num_array[idx] = + static_cast(static_cast(cond_data[idx])); + } +} + +template +__global__ void SetTrueIndex(int64_t *out_ptr, const T *cond_data, + const int64_t numel, const int64_t *stride_array, + const int64_t rank, + const int64_t *true_num_array) { + const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) { + // true_num_array is calculated by cub::InclusiveSum, + // because the first true element of true_num_array is 1, + // so we need to subtract 1 to get the true index. + const int64_t true_index = true_num_array[idx] - 1; + if (static_cast(cond_data[idx])) { + int64_t rank_index = idx; + for (int j = 0; j < rank; j++) { + const int64_t out_index = rank_index / stride_array[j]; + out_ptr[true_index * rank + j] = out_index; + rank_index -= out_index * stride_array[j]; } } - thrust::device_vector d_true_index = h_true_index; - int64_t* ptr_true_index = thrust::raw_pointer_cast(d_true_index.data()); - - size_t true_num = h_true_index.size(); + } +} +template +class CUDAWhereIndexKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *condition = context.Input("Condition"); + auto *out = context.Output("Out"); + auto &dev_ctx = context.template device_context(); + + const T *cond_data = condition->data(); + const int64_t numel = condition->numel(); + auto dims = condition->dims(); + const int rank = dims.size(); + + auto d_array_mem = memory::Alloc(dev_ctx, (numel + rank) * sizeof(int64_t)); + auto h_array_mem = + memory::Alloc(platform::CPUPlace(), (rank + 1) * sizeof(int64_t)); + + // "stride_array" is an array and len(stride_array)==rank, + // each element is the stride of each dimension -- the length from i to i+1.
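As a companion to the stride_array comment above: per true element, SetTrueIndex unravels the flat index into one coordinate per dimension using the usual row-major strides. A standalone sketch of that unravelling (Unravel is a hypothetical helper for illustration):

#include <cstdint>
#include <vector>

std::vector<int64_t> Unravel(int64_t flat, const std::vector<int64_t>& dims) {
  const int rank = static_cast<int>(dims.size());
  std::vector<int64_t> stride(rank, 1), coord(rank, 0);
  for (int i = rank - 2; i >= 0; --i) stride[i] = stride[i + 1] * dims[i + 1];
  for (int i = 0; i < rank; ++i) {
    coord[i] = flat / stride[i];   // index along dimension i
    flat -= coord[i] * stride[i];
  }
  return coord;
}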
+ int64_t *h_stride_array = reinterpret_cast(h_array_mem->ptr()); + int64_t *d_stride_array = reinterpret_cast(d_array_mem->ptr()); + + // "true_num_array" is an array and len(stride_array)==numel, + // at the beginning, + // "true_num_array" will set 1 if condition[i] == true else 0, + // then it will be calculated by cub::InclusiveSum, + // so that we can get the true number before i as the out index + int64_t *d_true_num_array = d_stride_array + rank; + + // the total_true_num is the total number of condition[i] == true + int64_t *h_total_true_num = h_stride_array + rank; + + // alloce cub memory + size_t cub_size = 0; + cub::DeviceScan::InclusiveSum(nullptr, cub_size, d_true_num_array, + d_true_num_array, numel, dev_ctx.stream()); + auto cub_mem = memory::Alloc(dev_ctx, cub_size * sizeof(int64_t)); + void *cub_data = cub_mem->ptr(); + + // set d_true_num_array[i]=1 if cond_data[i]==true else 0 + const int threads = std::min(numel, static_cast(128)); + const int64_t need_grids = (numel + threads - 1) / threads; + const int grids = std::min(need_grids, static_cast(256)); + GetTrueNum<<>>(cond_data, numel, + d_true_num_array); + + // calculate the inclusive prefix sum of "true_num_array" + // to get the index of "out" tensor, + // and the total number of cond_data[i]==true. + // Example: + // condition: F T T F F F T T + // before: 0 1 1 0 0 0 1 1 + // after: 0 1 2 2 2 2 3 4 + // out: 1 2 6 7 + cub::DeviceScan::InclusiveSum(cub_data, cub_size, d_true_num_array, + d_true_num_array, numel, dev_ctx.stream()); + + // calculate each dimension's stride + h_stride_array[rank - 1] = 1; + for (int i = rank - 2; i >= 0; i--) { + h_stride_array[i] = h_stride_array[i + 1] * dims[i + 1]; + } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_stride_array, platform::CPUPlace(), h_stride_array, + rank * sizeof(int64_t), dev_ctx.stream()); + + // get total ture number and set output size + // the last element of cub::InclusiveSum is the total number + memory::Copy(platform::CPUPlace(), h_total_true_num, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_true_num_array + numel - 1, sizeof(int64_t), + dev_ctx.stream()); + dev_ctx.Wait(); + + int64_t true_num = *h_total_true_num; out->Resize(framework::make_ddim({static_cast(true_num), rank})); - auto out_ptr = out->mutable_data(context.GetPlace()); + auto out_data = out->mutable_data(context.GetPlace()); if (true_num == 0) { return; } - thrust::host_vector h_stride(rank, 0); - h_stride[rank - 1] = 1; - for (int i = rank - 2; i >= 0; i--) { - h_stride[i] = h_stride[i + 1] * dims[i + 1]; - } - thrust::device_vector d_stride = h_stride; - int64_t* ptr_stride = thrust::raw_pointer_cast(d_stride.data()); - - auto& dev_ctx = context.template device_context(); - WhereIndexFunctor functor(ptr_true_index, true_num, ptr_stride, - rank, out_ptr); - platform::ForRange for_range(dev_ctx, true_num); - for_range(functor); + // using true_num_array and stride_array to calculate the output index + SetTrueIndex<<>>( + out_data, cond_data, numel, d_stride_array, rank, d_true_num_array); } }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 0827d6a5ae7644579ffc2ab502893ec1e6ab1ee2..36a956762174e18ed7eef1d6e1158b82bf3ceeae 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,7 +1,7 @@ proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto simple_threadpool) proto_library(error_codes_proto SRCS error_codes.proto) if(WITH_GPU) - 
proto_library(cuda_error_proto SRCS cuda_error.proto) + proto_library(external_error_proto SRCS external_error.proto) endif(WITH_GPU) if(WITH_XPU) @@ -45,7 +45,7 @@ cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) set(enforce_deps flags errors boost) if(WITH_GPU) - set(enforce_deps ${enforce_deps} cuda_error_proto) + set(enforce_deps ${enforce_deps} external_error_proto) endif() cc_library(enforce INTERFACE SRCS enforce.cc DEPS ${enforce_deps}) cc_library(monitor SRCS monitor.cc) @@ -187,10 +187,12 @@ endif() cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) +cc_test(complex_test SRCS complex_test.cc DEPS lod_tensor) IF(WITH_GPU) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) nv_test(bfloat16_gpu_test SRCS bfloat16_test.cu DEPS lod_tensor) + nv_test(complex_gpu_test SRCS complex_test.cu DEPS lod_tensor) nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) ENDIF() diff --git a/paddle/fluid/platform/complex.h b/paddle/fluid/platform/complex.h new file mode 100644 index 0000000000000000000000000000000000000000..2c1b42ea4882d563a5338256947339d3ab49aab4 --- /dev/null +++ b/paddle/fluid/platform/complex.h @@ -0,0 +1,537 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
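For orientation before the header body: the new file defines one templated paddle::platform::complex<T>, whose float and double instantiations take over from the complex64 and complex128 structs deleted later in this diff. A minimal usage sketch, not part of the patch, assuming a Paddle build with this header on the include path and using only operations exercised by the new tests:

#include <iostream>
#include "paddle/fluid/platform/complex.h"

using paddle::platform::complex;

int main() {
  complex<float> a(1.0f, 2.0f);   // 1 + 2i
  complex<float> b = a * a;       // (1 + 2i)^2 = -3 + 4i
  complex<double> c(a);           // explicit cross-precision conversion
  std::cout << b << " |a| = " << abs(a) << std::endl;  // operator<< and abs()
  return c.real == 1.0 ? 0 : 1;
}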
+ +#pragma once + +#include + +#include +#include +#include +#include +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif // PADDLE_WITH_CUDA + +#ifdef PADDLE_WITH_HIP +#include +#include // NOLINT +#endif + +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif + +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define HOSTDEVICE __host__ __device__ +#define DEVICE __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define DEVICE +#define HOST +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// todo +#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX +#endif + +namespace paddle { +namespace platform { + +template +struct PADDLE_ALIGN(sizeof(T) * 2) complex { + public: + T real; + T imag; + + complex() = default; + complex(const complex& o) = default; + complex& operator=(const complex& o) = default; + complex(complex&& o) = default; + complex& operator=(complex&& o) = default; + ~complex() = default; + + HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + + template + HOSTDEVICE inline explicit complex(const thrust::complex& c) { + real = c.real(); + imag = c.imag(); + } + + template + HOSTDEVICE inline explicit operator thrust::complex() const { + return thrust::complex(real, imag); + } + +#ifdef PADDLE_WITH_HIP + HOSTDEVICE inline explicit operator hipFloatComplex() const { + return make_hipFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator hipDoubleComplex() const { + return make_hipDoubleComplex(real, imag); + } +#else + HOSTDEVICE inline explicit operator cuFloatComplex() const { + return make_cuFloatComplex(real, imag); + } + + HOSTDEVICE inline explicit operator cuDoubleComplex() const { + return make_cuDoubleComplex(real, imag); + } +#endif +#endif + + template ::value || + std::is_integral::value, + int>::type = 0> + HOSTDEVICE complex(const T1& val) { + real = static_cast(val); + imag = static_cast(0.0); + } + + template + HOSTDEVICE explicit complex( + const std::enable_if_t::value, complex>& + val) { + real = val.real; + imag = val.imag; + } + + template + HOSTDEVICE explicit complex( + const std::enable_if_t::value, complex>& + val) { + real = val.real; + imag = val.imag; + } + + template + HOSTDEVICE inline explicit operator std::complex() const { + return static_cast>(std::complex(real, imag)); + } + + template + HOSTDEVICE complex(const std::complex& val) + : real(val.real()), imag(val.imag()) {} + + template ::value || + std::is_integral::value, + int>::type = 0> + HOSTDEVICE inline complex& operator=(const T1& val) { + real = static_cast(val); + imag = static_cast(0.0); + return *this; + } + + HOSTDEVICE inline explicit operator bool() const { + return static_cast(this->real) || static_cast(this->imag); + } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(this->real); + } + + 
HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator float() const { + return static_cast(this->real); + } + + HOSTDEVICE inline explicit operator double() const { + return static_cast(this->real); + } +}; + +template +HOSTDEVICE inline complex operator+(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) + thrust::complex(b)); +#else + return complex(a.real + b.real, a.imag + b.imag); +#endif +} + +template +HOSTDEVICE inline complex operator-(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) - thrust::complex(b)); +#else + return complex(a.real - b.real, a.imag - b.imag); +#endif +} + +template +HOSTDEVICE inline complex operator*(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) * thrust::complex(b)); +#else + return complex(a.real * b.real - a.imag * b.imag, + a.imag * b.real + b.imag * a.real); +#endif +} + +template +HOSTDEVICE inline complex operator/(const complex& a, + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::complex(a) / thrust::complex(b)); +#else + T denominator = b.real * b.real + b.imag * b.imag; + return complex((a.real * b.real + a.imag * b.imag) / denominator, + (a.imag * b.real - a.real * b.imag) / denominator); +#endif +} + +template +HOSTDEVICE inline complex operator-(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(-thrust::complex(a.real, a.imag)); +#else + complex res; + res.real = -a.real; + res.imag = -a.imag; + return res; +#endif +} + +template +HOSTDEVICE inline complex& operator+=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) += + thrust::complex(b.real, b.imag)); + return a; +#else + a.real += b.real; + a.imag += b.imag; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator-=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) -= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real -= b.real; + a.imag -= b.imag; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator*=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) *= + thrust::complex(b.real, b.imag)); + return a; +#else + a.real = a.real * b.real - a.imag * b.imag; + a.imag = a.imag * b.real + b.imag * a.real; + return a; +#endif +} + +template +HOSTDEVICE inline complex& operator/=(complex& a, // NOLINT + const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + a = complex(thrust::complex(a.real, a.imag) /= + thrust::complex(b.real, b.imag)); + return a; +#else + T denominator = b.real * b.real + b.imag * b.imag; + a.real = (a.real * b.real + 
a.imag * b.imag) / denominator; + a.imag = (a.imag * b.real - a.real * b.imag) / denominator; + return a; +#endif +} + +template +HOSTDEVICE inline complex raw_uint16_to_complex64(uint16_t a) { + complex res; + res.real = a; + res.imag = 0.0; + return res; +} + +template +HOSTDEVICE inline bool operator==(const complex& a, const complex& b) { + return a.real == b.real && a.imag == b.imag; +} + +template +HOSTDEVICE inline bool operator!=(const complex& a, const complex& b) { + return a.real != b.real || a.imag != b.imag; +} + +template +HOSTDEVICE inline bool operator<(const complex& a, const complex& b) { + return a.real < b.real; +} + +template +HOSTDEVICE inline bool operator<=(const complex& a, const complex& b) { + return a.real <= b.real; +} + +template +HOSTDEVICE inline bool operator>(const complex& a, const complex& b) { + return a.real > b.real; +} + +template +HOSTDEVICE inline bool operator>=(const complex& a, const complex& b) { + return a.real >= b.real; +} + +template +HOSTDEVICE inline complex max(const complex& a, const complex& b) { + return (a.real >= b.real) ? a : b; +} + +template +HOSTDEVICE inline complex min(const complex& a, const complex& b) { + return (a.real < b.real) ? a : b; +} + +template +HOSTDEVICE inline bool(isnan)(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isnan(a.real) || ::isnan(a.imag); +#else + return std::isnan(a.real) || std::isnan(a.imag); +#endif +} + +template +HOSTDEVICE inline bool isinf(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isinf(a.real) || ::isinf(a.imag); +#else + return std::isinf(a.real) || std::isinf(a.imag); +#endif +} + +template +HOSTDEVICE inline bool isfinite(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return ::isfinite(a.real) || ::isfinite(a.imag); +#else + return std::isfinite(a.real) || std::isfinite(a.imag); +#endif +} + +template +HOSTDEVICE inline T abs(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return thrust::abs(thrust::complex(a)); +#else + return std::abs(std::complex(a)); +#endif +} + +template +HOSTDEVICE inline complex pow(const complex& a, const complex& b) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::pow(thrust::complex(a), thrust::complex(b))); +#else + return complex(std::pow(std::complex(a), std::complex(b))); +#endif +} + +template +HOSTDEVICE inline complex sqrt(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::sqrt(thrust::complex(a))); +#else + return complex(std::sqrt(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex tanh(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::tanh(thrust::complex(a))); +#else + return complex(std::tanh(std::complex(a))); +#endif +} + +template +HOSTDEVICE inline complex log(const complex& a) { +#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \ + (defined(__CUDA_ARCH__) || defined(__HIPCC__)) + return complex(thrust::log(thrust::complex(a))); +#else + return complex(std::log(std::complex(a))); +#endif +} + +template +inline std::ostream& operator<<(std::ostream& 
os, const complex& a) { + os << "real:" << a.real << " imag:" << a.imag; + return os; +} + +} // namespace platform +} // namespace paddle + +namespace std { + +template +struct is_pod> { + static const bool value = true; +}; + +template +struct is_floating_point> + : std::integral_constant {}; + +template +struct is_signed> { + static const bool value = false; +}; + +template +struct is_unsigned> { + static const bool value = false; +}; + +template +inline bool isnan(const paddle::platform::complex& a) { + return paddle::platform::isnan(a); +} + +template +inline bool isinf(const paddle::platform::complex& a) { + return paddle::platform::isinf(a); +} + +template +struct numeric_limits> { + static const bool is_specialized = false; + static const bool is_signed = false; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = false; + static const bool has_quiet_NaN = false; + static const bool has_signaling_NaN = false; + static const float_denorm_style has_denorm = denorm_absent; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_toward_zero; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 0; + static const int digits10 = 0; + static const int max_digits10 = 0; + static const int radix = 0; + static const int min_exponent = 0; + static const int min_exponent10 = 0; + static const int max_exponent = 0; + static const int max_exponent10 = 0; + static const bool traps = false; + static const bool tinyness_before = false; + + static paddle::platform::complex min() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex lowest() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex max() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex epsilon() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex round_error() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex infinity() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex quiet_NaN() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex signaling_NaN() { + return paddle::platform::complex(0.0, 0.0); + } + static paddle::platform::complex denorm_min() { + return paddle::platform::complex(0.0, 0.0); + } +}; + +} // namespace std diff --git a/paddle/fluid/platform/complex128.h b/paddle/fluid/platform/complex128.h deleted file mode 100644 index da2f83c3497cce7b162336360690e1e76bce8b19..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/complex128.h +++ /dev/null @@ -1,535 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
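A side note on the arithmetic operators of the new complex.h above, before the deleted legacy headers that follow: each operator is written once with HOSTDEVICE and switches between thrust::complex math on device compilation passes and plain componentwise math on host passes. Stripped to its core (the helper name below is mine, and it merely restates operator+):

#include "paddle/fluid/platform/complex.h"

namespace paddle {
namespace platform {

template <typename T>
HOSTDEVICE inline complex<T> AddSketch(const complex<T>& a,
                                       const complex<T>& b) {
#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX) && \
    (defined(__CUDA_ARCH__) || defined(__HIPCC__))
  // Device pass: reuse thrust's complex arithmetic.
  return complex<T>(thrust::complex<T>(a) + thrust::complex<T>(b));
#else
  // Host pass: plain componentwise arithmetic.
  return complex<T>(a.real + b.real, a.imag + b.imag);
#endif
}

}  // namespace platform
}  // namespace paddle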
- -#pragma once - -#include - -#include -#include -#include -#include - -#ifdef PADDLE_WITH_CUDA -#include -#include -#endif // PADDLE_WITH_CUDA - -#ifdef PADDLE_WITH_HIP -#include -#include // NOLINT -#endif - -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif - -#if (defined(__CUDACC__) || defined(__HIPCC__)) -#define HOSTDEVICE __host__ __device__ -#define DEVICE __device__ -#define HOST __host__ -#else -#define HOSTDEVICE -#define DEVICE -#define HOST -#endif - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX128 -#endif - -namespace paddle { -namespace platform { - -struct PADDLE_ALIGN(16) complex128 { - public: - double real; - double imag; - - complex128() = default; - complex128(const complex128& o) = default; - complex128& operator=(const complex128& o) = default; - complex128(complex128&& o) = default; - complex128& operator=(complex128&& o) = default; - ~complex128() = default; - - HOSTDEVICE complex128(double real, double imag) : real(real), imag(imag) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - - HOSTDEVICE inline explicit complex128(const thrust::complex& c) { - real = c.real(); - imag = c.imag(); - } - - HOSTDEVICE inline explicit operator thrust::complex() const { - return thrust::complex(real, imag); - } - -#ifdef PADDLE_WITH_HIP - HOSTDEVICE inline explicit operator hipDoubleComplex() const { - return make_hipDoubleComplex(real, imag); - } -#else - HOSTDEVICE inline explicit operator cuDoubleComplex() const { - return make_cuDoubleComplex(real, imag); - } -#endif -#endif - - HOSTDEVICE complex128(const float& val) - : real(static_cast(val)), imag(0) {} - HOSTDEVICE complex128(const double& val) : real(val), imag(0) {} - HOSTDEVICE complex128(const int& val) - : real(static_cast(val)), imag(0) {} - HOSTDEVICE complex128(const int64_t& val) - : real(static_cast(val)), imag(0) {} - - HOSTDEVICE inline explicit operator std::complex() { - return static_cast>(std::complex(real, imag)); - } - - template - HOSTDEVICE inline explicit complex128(const T& val) - : real(complex128(static_cast(val)).real) {} - - HOSTDEVICE complex128(const std::complex val) - : real(val.real()), imag(val.imag()) {} - - HOSTDEVICE inline complex128& operator=(bool b) { - real = b ? 
1 : 0; - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(int8_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(uint8_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(int16_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(uint16_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(int32_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(uint32_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(int64_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(uint64_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(float val) { - real = val; - imag = 0; - return *this; - } - - HOSTDEVICE inline complex128& operator=(double val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline operator float() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator bool() const { - return static_cast(this->real) || static_cast(this->imag); - } - - HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator double() const { - return static_cast(this->real); - } -}; - -HOSTDEVICE inline complex128 operator+(const complex128& a, - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::complex(a.real, a.imag) + - thrust::complex(b.real, b.imag)); -#else - return complex128(a.real + b.real, a.imag + b.imag); -#endif -} - -HOSTDEVICE inline complex128 operator-(const complex128& a, - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::complex(a.real, a.imag) - - thrust::complex(b.real, b.imag)); -#else - return complex128(a.real - b.real, a.imag - b.imag); -#endif -} - -HOSTDEVICE inline complex128 operator*(const complex128& a, - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::complex(a.real, a.imag) * - thrust::complex(b.real, b.imag)); -#else - return complex128(a.real * b.real - a.imag * b.imag, - a.imag * b.real + b.imag * a.real); -#endif -} - -HOSTDEVICE inline complex128 operator/(const complex128& a, - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || 
defined(__HIPCC__)) - return complex128(thrust::complex(a.real, a.imag) / - thrust::complex(b.real, b.imag)); -#else - double denominator = b.real * b.real + b.imag * b.imag; - return complex128((a.real * b.real + a.imag * b.imag) / denominator, - (a.imag * b.real - a.real * b.imag) / denominator); -#endif -} - -HOSTDEVICE inline complex128 operator-(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(-thrust::complex(a.real, a.imag)); -#else - complex128 res; - res.real = -a.real; - res.imag = -a.imag; - return res; -#endif -} - -HOSTDEVICE inline complex128& operator+=(complex128& a, // NOLINT - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex128(thrust::complex(a.real, a.imag) += - thrust::complex(b.real, b.imag)); - return a; -#else - a.real += b.real; - a.imag += b.imag; - return a; -#endif -} - -HOSTDEVICE inline complex128& operator-=(complex128& a, // NOLINT - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex128(thrust::complex(a.real, a.imag) -= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real -= b.real; - a.imag -= b.imag; - return a; -#endif -} - -HOSTDEVICE inline complex128& operator*=(complex128& a, // NOLINT - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex128(thrust::complex(a.real, a.imag) *= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real = a.real * b.real - a.imag * b.imag; - a.imag = a.imag * b.real + b.imag * a.real; - return a; -#endif -} - -HOSTDEVICE inline complex128& operator/=(complex128& a, // NOLINT - const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex128(thrust::complex(a.real, a.imag) /= - thrust::complex(b.real, b.imag)); - return a; -#else - double denominator = b.real * b.real + b.imag * b.imag; - a.real = (a.real * b.real + a.imag * b.imag) / denominator; - a.imag = (a.imag * b.real - a.real * b.imag) / denominator; - return a; -#endif -} - -HOSTDEVICE inline complex128 raw_uint16_to_complex128(uint16_t a) { - complex128 res; - res.real = a; - return res; -} - -HOSTDEVICE inline bool operator==(const complex128& a, const complex128& b) { - return a.real == b.real && a.imag == b.imag; -} - -HOSTDEVICE inline bool operator!=(const complex128& a, const complex128& b) { - return a.real != b.real || a.imag != b.imag; -} - -HOSTDEVICE inline bool operator<(const complex128& a, const complex128& b) { - return static_cast(a.real) < static_cast(b.real); -} - -HOSTDEVICE inline bool operator<=(const complex128& a, const complex128& b) { - return static_cast(a.real) <= static_cast(b.real); -} - -HOSTDEVICE inline bool operator>(const complex128& a, const complex128& b) { - return static_cast(a.real) > static_cast(b.real); -} - -HOSTDEVICE inline bool operator>=(const complex128& a, const complex128& b) { - return static_cast(a.real) >= static_cast(b.real); -} - -HOSTDEVICE inline bool(isnan)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA) && defined(__CUDA_ARCH__) - // __isnanf not supported on HIP platform - return __isnan(a.real) || __isnan(a.imag); -#else - return std::isnan(a.real) || std::isnan(a.imag); -#endif -} - -HOSTDEVICE inline bool(isinf)(const complex128& a) { -#if 
defined(PADDLE_WITH_CUDA) && defined(__CUDA_ARCH__) - // __isinf not supported on HIP platform - return __isinf(a.real) || __isinf(a.imag); -#else - return std::isinf(a.real) || std::isinf(a.imag); -#endif -} - -HOSTDEVICE inline bool(isfinite)(const complex128& a) { - return !((isnan)(a)) && !((isinf)(a)); -} - -HOSTDEVICE inline double(abs)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return thrust::abs(thrust::complex(a.real, a.imag)); -#else - return std::abs(std::complex(a.real, a.imag)); -#endif -} - -HOSTDEVICE inline complex128(pow)(const complex128& a, const complex128& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::pow(thrust::complex(a.real, a.imag), - thrust::complex(b.real, b.imag))); -#else - return std::pow(std::complex(a), std::complex(b)); -#endif -} - -HOSTDEVICE inline complex128(sqrt)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::sqrt(thrust::complex(a.real, a.imag))); -#else - return std::sqrt(std::complex(a)); -#endif -} - -HOSTDEVICE inline complex128(tanh)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::tanh(thrust::complex(a.real, a.imag))); -#else - return std::tanh(std::complex(a)); -#endif -} - -HOSTDEVICE inline complex128(log)(const complex128& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX128) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex128(thrust::log(thrust::complex(a.real, a.imag))); -#else - return complex128(std::log(std::complex(a))); -#endif -} - -inline std::ostream& operator<<(std::ostream& os, const complex128& a) { - os << "real:" << a.real << " imag:" << a.imag; - return os; -} - -} // namespace platform -} // namespace paddle - -namespace std { - -template <> -struct is_pod { - static const bool value = - is_trivial::value && - is_standard_layout::value; -}; - -template <> -struct is_floating_point - : std::integral_constant< - bool, std::is_same::type>::value> { -}; -template <> -struct is_signed { - static const bool value = false; -}; - -template <> -struct is_unsigned { - static const bool value = false; -}; - -inline bool isnan(const paddle::platform::complex128& a) { - return paddle::platform::isnan(a); -} - -inline bool isinf(const paddle::platform::complex128& a) { - return paddle::platform::isinf(a); -} - -template <> -struct numeric_limits { - static const bool is_specialized = false; - static const bool is_signed = false; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = false; - static const bool has_quiet_NaN = false; - static const bool has_signaling_NaN = false; - static const float_denorm_style has_denorm = denorm_absent; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_toward_zero; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 0; - static const int digits10 = 0; - static const int max_digits10 = 0; - static const int radix = 0; - static const int min_exponent = 0; - static const int min_exponent10 = 0; - static const int max_exponent = 0; - static const int max_exponent10 = 0; - static const bool traps = 
false; - static const bool tinyness_before = false; - - static paddle::platform::complex128(min)() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 lowest() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128(max)() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 epsilon() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 round_error() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 infinity() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 quiet_NaN() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 signaling_NaN() { - return paddle::platform::complex128(0.0, 0.0); - } - static paddle::platform::complex128 denorm_min() { - return paddle::platform::complex128(0.0, 0.0); - } -}; - -} // namespace std - -#define MKL_Complex16 paddle::platform::complex128 diff --git a/paddle/fluid/platform/complex64.h b/paddle/fluid/platform/complex64.h deleted file mode 100644 index 0aad7bd9dd2a8f1d59833720b442e34afa176ca6..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/complex64.h +++ /dev/null @@ -1,538 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include - -#include -#include -#include -#include - -#ifdef PADDLE_WITH_CUDA -#include -#include -#endif // PADDLE_WITH_CUDA - -#ifdef PADDLE_WITH_HIP -#include -#include // NOLINT -#endif - -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif - -#if (defined(__CUDACC__) || defined(__HIPCC__)) -#define HOSTDEVICE __host__ __device__ -#define DEVICE __device__ -#define HOST __host__ -#else -#define HOSTDEVICE -#define DEVICE -#define HOST -#endif - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#define PADDLE_WITH_CUDA_OR_HIP_COMPLEX64 -#endif - -#include "complex128.h" // NOLINT - -namespace paddle { -namespace platform { - -struct PADDLE_ALIGN(8) complex64 { - public: - float real; - float imag; - - complex64() = default; - complex64(const complex64& o) = default; - complex64& operator=(const complex64& o) = default; - complex64(complex64&& o) = default; - complex64& operator=(complex64&& o) = default; - ~complex64() = default; - - HOSTDEVICE complex64(float real, float imag) : real(real), imag(imag) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - - HOSTDEVICE inline explicit complex64(const thrust::complex& c) { - real = c.real(); - imag = c.imag(); - } - - HOSTDEVICE inline explicit operator thrust::complex() const { - return thrust::complex(real, imag); - } - -#ifdef PADDLE_WITH_HIP - HOSTDEVICE inline explicit operator hipFloatComplex() const { - return make_hipFloatComplex(real, imag); - } -#else - HOSTDEVICE inline explicit operator cuFloatComplex() const { - return make_cuFloatComplex(real, imag); - } -#endif -#endif - - HOSTDEVICE complex64(const float& val) : real(val), imag(0) {} - HOSTDEVICE complex64(const double& val) - : real(static_cast(val)), imag(0) {} - HOSTDEVICE complex64(const int& val) - : real(static_cast(val)), imag(0) {} - HOSTDEVICE complex64(const int64_t& val) - : real(static_cast(val)), imag(0) {} - HOSTDEVICE complex64(const complex128& val) - : real(static_cast(val.real)), - imag(static_cast(val.imag)) {} - - HOSTDEVICE inline explicit operator std::complex() { - return static_cast>(std::complex(real, imag)); - } - - template - HOSTDEVICE inline explicit complex64(const T& val) - : real(complex64(static_cast(val)).real) {} - - HOSTDEVICE complex64(const std::complex val) - : real(val.real()), imag(val.imag()) {} - - HOSTDEVICE inline complex64& operator=(bool b) { - real = b ? 
1 : 0; - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(int8_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(uint8_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(int16_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(uint16_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(int32_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(uint32_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(int64_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(uint64_t val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(float val) { - real = val; - imag = 0; - return *this; - } - - HOSTDEVICE inline complex64& operator=(double val) { - real = static_cast(val); - imag = 0; - return *this; - } - - HOSTDEVICE inline operator float() const { return this->real; } - - HOSTDEVICE inline explicit operator bool() const { - return static_cast(this->real) || static_cast(this->imag); - } - - HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(this->real); - } - - HOSTDEVICE inline explicit operator double() const { - return static_cast(this->real); - } - - HOSTDEVICE inline operator complex128() const { - return complex128(static_cast(this->real), - static_cast(this->imag)); - } -}; - -HOSTDEVICE inline complex64 operator+(const complex64& a, const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::complex(a.real, a.imag) + - thrust::complex(b.real, b.imag)); -#else - return complex64(a.real + b.real, a.imag + b.imag); -#endif -} - -HOSTDEVICE inline complex64 operator-(const complex64& a, const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::complex(a.real, a.imag) - - thrust::complex(b.real, b.imag)); -#else - return complex64(a.real - b.real, a.imag - b.imag); -#endif -} - -HOSTDEVICE inline complex64 operator*(const complex64& a, const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::complex(a.real, a.imag) * - thrust::complex(b.real, b.imag)); -#else - return complex64(a.real * b.real - a.imag * b.imag, - a.imag * b.real + b.imag * a.real); -#endif -} - -HOSTDEVICE inline complex64 operator/(const complex64& a, const complex64& b) { -#if 
defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::complex(a.real, a.imag) / - thrust::complex(b.real, b.imag)); -#else - float denominator = b.real * b.real + b.imag * b.imag; - return complex64((a.real * b.real + a.imag * b.imag) / denominator, - (a.imag * b.real - a.real * b.imag) / denominator); -#endif -} - -HOSTDEVICE inline complex64 operator-(const complex64& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(-thrust::complex(a.real, a.imag)); -#else - complex64 res; - res.real = -a.real; - res.imag = -a.imag; - return res; -#endif -} - -HOSTDEVICE inline complex64& operator+=(complex64& a, // NOLINT - const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex64(thrust::complex(a.real, a.imag) += - thrust::complex(b.real, b.imag)); - return a; -#else - a.real += b.real; - a.imag += b.imag; - return a; -#endif -} - -HOSTDEVICE inline complex64& operator-=(complex64& a, // NOLINT - const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex64(thrust::complex(a.real, a.imag) -= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real -= b.real; - a.imag -= b.imag; - return a; -#endif -} - -HOSTDEVICE inline complex64& operator*=(complex64& a, // NOLINT - const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex64(thrust::complex(a.real, a.imag) *= - thrust::complex(b.real, b.imag)); - return a; -#else - a.real = a.real * b.real - a.imag * b.imag; - a.imag = a.imag * b.real + b.imag * a.real; - return a; -#endif -} - -HOSTDEVICE inline complex64& operator/=(complex64& a, // NOLINT - const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - a = complex64(thrust::complex(a.real, a.imag) /= - thrust::complex(b.real, b.imag)); - return a; -#else - float denominator = b.real * b.real + b.imag * b.imag; - a.real = (a.real * b.real + a.imag * b.imag) / denominator; - a.imag = (a.imag * b.real - a.real * b.imag) / denominator; - return a; -#endif -} - -HOSTDEVICE inline complex64 raw_uint16_to_complex64(uint16_t a) { - complex64 res; - res.real = a; - return res; -} - -HOSTDEVICE inline bool operator==(const complex64& a, const complex64& b) { - return a.real == b.real && a.imag == b.imag; -} - -HOSTDEVICE inline bool operator!=(const complex64& a, const complex64& b) { - return a.real != b.real || a.imag != b.imag; -} - -HOSTDEVICE inline bool operator<(const complex64& a, const complex64& b) { - return static_cast(a.real) < static_cast(b.real); -} - -HOSTDEVICE inline bool operator<=(const complex64& a, const complex64& b) { - return static_cast(a.real) <= static_cast(b.real); -} - -HOSTDEVICE inline bool operator>(const complex64& a, const complex64& b) { - return static_cast(a.real) > static_cast(b.real); -} - -HOSTDEVICE inline bool operator>=(const complex64& a, const complex64& b) { - return static_cast(a.real) >= static_cast(b.real); -} - -HOSTDEVICE inline bool(isnan)(const complex64& a) { -#if defined(PADDLE_WITH_CUDA) && defined(__CUDA_ARCH__) - // __isnanf not supported on HIP platform - return __isnanf(a.real) || __isnanf(a.imag); -#else - return std::isnan(a.real) || std::isnan(a.imag); -#endif -} - -HOSTDEVICE inline 
bool(isinf)(const complex64& a) { -#if defined(PADDLE_WITH_CUDA) && defined(__CUDA_ARCH__) - // __isinff not supported on HIP platform - return __isinff(a.real) || __isinff(a.imag); -#else - return std::isinf(a.real) || std::isinf(a.imag); -#endif -} - -HOSTDEVICE inline bool(isfinite)(const complex64& a) { - return !((isnan)(a)) && !((isinf)(a)); -} - -HOSTDEVICE inline float(abs)(const complex64& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::abs(thrust::complex(a.real, a.imag))); -#else - return std::abs(std::complex(a.real, a.imag)); -#endif -} - -HOSTDEVICE inline complex64(pow)(const complex64& a, const complex64& b) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::pow(thrust::complex(a.real, a.imag), - thrust::complex(b.real, b.imag))); -#else - return std::pow(std::complex(a), std::complex(b)); -#endif -} - -HOSTDEVICE inline complex64(sqrt)(const complex64& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::sqrt(thrust::complex(a.real, a.imag))); -#else - return std::sqrt(std::complex(a)); -#endif -} - -HOSTDEVICE inline complex64(tanh)(const complex64& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::tanh(thrust::complex(a.real, a.imag))); -#else - return std::tanh(std::complex(a)); -#endif -} - -HOSTDEVICE inline complex64(log)(const complex64& a) { -#if defined(PADDLE_WITH_CUDA_OR_HIP_COMPLEX64) && \ - (defined(__CUDA_ARCH__) || defined(__HIPCC__)) - return complex64(thrust::log(thrust::complex(a.real, a.imag))); -#else - return std::log(std::complex(a)); -#endif -} - -inline std::ostream& operator<<(std::ostream& os, const complex64& a) { - os << "real:" << a.real << " imag:" << a.imag; - return os; -} - -} // namespace platform -} // namespace paddle - -namespace std { - -template <> -struct is_pod { - static const bool value = - is_trivial::value && - is_standard_layout::value; -}; - -template <> -struct is_floating_point - : std::integral_constant< - bool, std::is_same::type>::value> {}; -template <> -struct is_signed { - static const bool value = false; -}; - -template <> -struct is_unsigned { - static const bool value = false; -}; - -inline bool isnan(const paddle::platform::complex64& a) { - return paddle::platform::isnan(a); -} - -inline bool isinf(const paddle::platform::complex64& a) { - return paddle::platform::isinf(a); -} - -template <> -struct numeric_limits { - static const bool is_specialized = false; - static const bool is_signed = false; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = false; - static const bool has_quiet_NaN = false; - static const bool has_signaling_NaN = false; - static const float_denorm_style has_denorm = denorm_absent; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_toward_zero; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 0; - static const int digits10 = 0; - static const int max_digits10 = 0; - static const int radix = 0; - static const int min_exponent = 0; - static const int min_exponent10 = 0; - static const int max_exponent = 0; - static const int max_exponent10 = 0; - static 
const bool traps = false; - static const bool tinyness_before = false; - - static paddle::platform::complex64(min)() { - return paddle::platform::complex64(0.0, 0.0); - } - static paddle::platform::complex64 lowest() { - return paddle::platform::complex64(0.0, 0.0); - } - static paddle::platform::complex64(max)() { - return paddle::platform::complex64(0.0, 0.0); - } - static paddle::platform::complex64 epsilon() { - return paddle::platform::complex64(0.0, 0.0); - } - static paddle::platform::complex64 round_error() { - return paddle::platform::complex64(0.0, 0.0); - } - static paddle::platform::complex64 infinity() { - return paddle::platform::complex64(0.0, 0.0); - } - static paddle::platform::complex64 quiet_NaN() { - return paddle::platform::complex64(0.0, 0.0); - } - static paddle::platform::complex64 signaling_NaN() { - return paddle::platform::complex64(0.0, 0.0); - } - static paddle::platform::complex64 denorm_min() { - return paddle::platform::complex64(0.0, 0.0); - } -}; - -} // namespace std - -#define MKL_Complex8 paddle::platform::complex64 diff --git a/paddle/fluid/platform/complex_test.cc b/paddle/fluid/platform/complex_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4d13161e94faf910829fd93543e6c18990ea7813 --- /dev/null +++ b/paddle/fluid/platform/complex_test.cc @@ -0,0 +1,324 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
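One detail the CPU tests below rely on: complex.h adds isnan/isinf overloads for paddle::platform::complex<T> to namespace std, so std::isnan and std::isinf in the assertions check both components rather than binding to the scalar overloads. A small standalone illustration (not part of the test file):

#include <cmath>
#include "paddle/fluid/platform/complex.h"

using paddle::platform::complex;

int main() {
  complex<float> a(NAN, 0.0f);       // NaN in the real part only
  complex<double> b(1.0, INFINITY);  // Inf in the imaginary part only
  // Both resolve to the overloads complex.h declares in namespace std.
  const bool a_is_nan = std::isnan(a);
  const bool b_is_inf = std::isinf(b);
  return (a_is_nan && b_is_inf) ? 0 : 1;
}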
+ +#include "paddle/fluid/platform/complex.h" +#include +#include "paddle/fluid/platform/eigen_ext.h" + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +TEST(complex, conversion_cpu) { + // *********** complex ************* + // float to complex + EXPECT_EQ(complex().real, 0.0f); + EXPECT_EQ(complex().imag, 0.0f); + + EXPECT_EQ(complex(1.0f, 1.0f).real, 1.0f); + EXPECT_EQ(complex(1.0f, 1.0f).imag, 1.0f); + EXPECT_EQ(complex(0.0f, 1.0f).real, 0.0f); + EXPECT_EQ(complex(0.0f, 1.0f).imag, 1.0f); + + EXPECT_EQ(complex(1.0f).real, 1.0f); + EXPECT_EQ(complex(1.0f).imag, 0.0f); + + // int to complex + EXPECT_EQ(complex(1).real, 1.0f); + EXPECT_EQ(complex(0).real, 0.0f); + EXPECT_EQ(complex(2).real, 2.0f); + EXPECT_EQ(complex(-2).real, -2.0f); + + // bool to complex + EXPECT_EQ(complex(true).real, 1.0f); + EXPECT_EQ(complex(true).imag, 0.0f); + + // complex to complex + EXPECT_EQ(complex(complex(1.0, 2.0)).real, 1.0f); + EXPECT_EQ(complex(complex(1.0, 2.0)).imag, 2.0f); + + // std::complex to complex + EXPECT_EQ(complex(std::complex(1.0f, 2.0f)).real, 1.0f); + EXPECT_EQ(complex(std::complex(1.0f, 2.0f)).imag, 2.0f); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0f); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0f); + + // Assignment operator + complex c = 1.0f; + EXPECT_EQ(c.real, 1.0f); + EXPECT_EQ(c.imag, 0.0f); + c = complex(2.0, 2.0); + EXPECT_EQ(c.real, 2.0f); + EXPECT_EQ(c.imag, 2.0f); + + // Conversion operator + EXPECT_EQ(static_cast(complex(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(complex(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(complex(-1)), -1); + EXPECT_EQ(static_cast(complex(true)), true); + + // *********** complex ************* + // double to complex + EXPECT_EQ(complex().real, 0.0); + EXPECT_EQ(complex().imag, 0.0); + + EXPECT_EQ(complex(1.0, 1.0).real, 1.0); + EXPECT_EQ(complex(1.0, 1.0).imag, 1.0); + EXPECT_EQ(complex(0.0, 1.0).real, 0.0); + EXPECT_EQ(complex(0.0, 1.0).imag, 1.0); + + EXPECT_EQ(complex(1.0).real, 1.0); + EXPECT_EQ(complex(1.0).imag, 0.0); + + // int to complex + EXPECT_EQ(complex(1).real, 1.0); + EXPECT_EQ(complex(0).real, 0.0); + EXPECT_EQ(complex(2).real, 2.0); + EXPECT_EQ(complex(-2).real, -2.0); + + // bool to complex + EXPECT_EQ(complex(true).real, 1.0); + EXPECT_EQ(complex(true).imag, 0.0); + + // complex to complex + EXPECT_EQ(complex(complex(1.0f, 2.0f)).real, 1.0); + EXPECT_EQ(complex(complex(1.0f, 2.0f)).imag, 2.0); + + // std::complex to complex + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0); + + // Assignment operator + complex c1 = 1.0; + EXPECT_EQ(c1.real, 1.0); + EXPECT_EQ(c1.imag, 0.0); + c1 = complex(2.0, 2.0); + EXPECT_EQ(c1.real, 2.0); + EXPECT_EQ(c1.imag, 2.0); + + // Conversion operator + EXPECT_EQ(static_cast(complex(0.5)), 0.5); + EXPECT_NEAR(static_cast(complex(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(complex(-1)), -1); + EXPECT_EQ(static_cast(complex(true)), true); +} + +TEST(bfloat16, comparison_cpu) { + // *********** complex ************* + EXPECT_TRUE(complex(1.0f) == complex(1.0f)); + EXPECT_TRUE(complex(1.0f, 2.0f) == complex(1.0f, 2.0f)); + EXPECT_FALSE(complex(-1.0f) == complex(-0.5f)); + EXPECT_TRUE(complex(1.0f) != complex(0.5f)); 
+ EXPECT_FALSE(complex(-1.0f) != complex(-1.0f)); + EXPECT_TRUE(complex(1.0f) < complex(2.0f)); + EXPECT_FALSE(complex(-1.0f) < complex(-1.0f)); + EXPECT_TRUE(complex(1.0f) <= complex(1.0f)); + EXPECT_TRUE(complex(2.0f) > complex(1.0f)); + EXPECT_FALSE(complex(-2.0f) > complex(-2.0f)); + EXPECT_TRUE(complex(2.0f) >= complex(2.0f)); + + // *********** complex ************* + EXPECT_TRUE(complex(1.0) == complex(1.0)); + EXPECT_TRUE(complex(1.0, 2.0) == complex(1.0, 2.0)); + EXPECT_FALSE(complex(-1.0) == complex(-0.5f)); + EXPECT_TRUE(complex(1.0) != complex(0.5f)); + EXPECT_FALSE(complex(-1.0) != complex(-1.0)); + EXPECT_TRUE(complex(1.0) < complex(2.0)); + EXPECT_FALSE(complex(-1.0) < complex(-1.0)); + EXPECT_TRUE(complex(1.0) <= complex(1.0)); + EXPECT_TRUE(complex(2.0) > complex(1.0)); + EXPECT_FALSE(complex(-2.0) > complex(-2.0)); + EXPECT_TRUE(complex(2.0) >= complex(2.0)); +} + +TEST(complex, arithmetic_cpu) { + // *********** complex ************* + complex a = complex(1, 1) + complex(1, 1); + EXPECT_NEAR(a.real, 2, 0.001); + EXPECT_NEAR(a.imag, 2, 0.001); + + complex b = complex(-5, -5) + complex(5, 5); + EXPECT_EQ(b.real, 0); + EXPECT_EQ(b.imag, 0); + + complex c = + complex(0.33333f, 0.33333f) + complex(0.66667f, 0.66667f); + EXPECT_NEAR(c.real, 1.0f, 0.01); + EXPECT_NEAR(c.imag, 1.0f, 0.01); + + complex d = complex(3) - complex(5); + EXPECT_EQ(d.real, -2); + EXPECT_EQ(d.imag, 0); + + complex e = + complex(0.66667f, 0.66667f) - complex(0.33333f, 0.33333f); + EXPECT_NEAR(e.real, 0.33334f, 0.01); + EXPECT_NEAR(e.imag, 0.33334f, 0.01); + + complex f = complex(0.33f, 0.33f) * complex(0.2f, 0.2f); + EXPECT_NEAR(f.real, 0.0f, 0.01); + EXPECT_NEAR(f.imag, 0.132f, 0.01); + + complex g = complex(0.33f, 0.33f) / complex(0.2f, 0.2f); + EXPECT_NEAR(g.real, 1.65f, 0.01); + EXPECT_NEAR(g.imag, 0.0f, 0.01); + + complex h = -complex(0.33f, 0.33f); + EXPECT_NEAR(h.real, -0.33f, 0.01); + EXPECT_NEAR(h.imag, -0.33f, 0.01); + h = -complex(-0.33f, -0.33f); + EXPECT_NEAR(h.real, 0.33f, 0.01); + EXPECT_NEAR(h.imag, 0.33f, 0.01); + + complex i = complex(1.0, 1.0); + i += complex(2.0, 2.0); + EXPECT_NEAR(i.real, 3.0f, 0.01); + EXPECT_NEAR(i.imag, 3.0f, 0.01); + i -= complex(1.0, 1.0); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 2.0f, 0.01); + i *= complex(3, 2); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 10.0f, 0.01); + i /= complex(3, 2); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 2.0f, 0.01); + + // *********** complex ************* + complex a1 = complex(1, 1) + complex(1, 1); + EXPECT_NEAR(a1.real, 2, 0.001); + EXPECT_NEAR(a1.imag, 2, 0.001); + + complex b1 = complex(-5, -5) + complex(5, 5); + EXPECT_EQ(b1.real, 0); + EXPECT_EQ(b1.imag, 0); + + complex c1 = + complex(0.33333f, 0.33333f) + complex(0.66667f, 0.66667f); + EXPECT_NEAR(c1.real, 1.0f, 0.01); + EXPECT_NEAR(c1.imag, 1.0f, 0.01); + + complex d1 = complex(3) - complex(5); + EXPECT_EQ(d1.real, -2); + EXPECT_EQ(d1.imag, 0); + + complex e1 = + complex(0.66667f, 0.66667f) - complex(0.33333f, 0.33333f); + EXPECT_NEAR(e1.real, 0.33334f, 0.01); + EXPECT_NEAR(e1.imag, 0.33334f, 0.01); + + complex f1 = + complex(0.33f, 0.33f) * complex(0.2f, 0.2f); + EXPECT_NEAR(f1.real, 0.0f, 0.01); + EXPECT_NEAR(f1.imag, 0.132f, 0.01); + + complex g1 = + complex(0.33f, 0.33f) / complex(0.2f, 0.2f); + EXPECT_NEAR(g1.real, 1.65f, 0.01); + EXPECT_NEAR(g1.imag, 0.0f, 0.01); + + complex h1 = -complex(0.33f, 0.33f); + EXPECT_NEAR(h1.real, -0.33f, 0.01); + EXPECT_NEAR(h1.imag, -0.33f, 0.01); + h1 = -complex(-0.33f, -0.33f); + 
EXPECT_NEAR(h1.real, 0.33f, 0.01); + EXPECT_NEAR(h1.imag, 0.33f, 0.01); + + complex i1 = complex(1.0, 1.0); + i1 += complex(2.0, 2.0); + EXPECT_NEAR(i1.real, 3.0f, 0.01); + EXPECT_NEAR(i1.imag, 3.0f, 0.01); + i1 -= complex(1.0, 1.0); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 2.0f, 0.01); + i1 *= complex(3, 2); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 10.0f, 0.01); + i1 /= complex(3, 2); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 2.0f, 0.01); +} + +TEST(complex, print) { + complex a(1.0f); + std::cout << a << std::endl; + + complex b(1.0); + std::cout << b << std::endl; +} + +TEST(complex, isinf) { + // *********** complex ************* + complex a; + a.real = float(INFINITY); + EXPECT_EQ(std::isinf(a), true); + a.imag = float(INFINITY); + EXPECT_EQ(std::isinf(a), true); + + complex b = float(INFINITY); + EXPECT_EQ(std::isinf(b), true); + + complex c(float(INFINITY), 0); + EXPECT_EQ(std::isinf(c), true); + + // *********** complex ************* + complex a1; + a1.real = double(INFINITY); + EXPECT_EQ(std::isinf(a1), true); + a1.imag = double(INFINITY); + EXPECT_EQ(std::isinf(a1), true); + + complex b1 = double(INFINITY); + EXPECT_EQ(std::isinf(b1), true); + + complex c1(double(INFINITY), 0); + EXPECT_EQ(std::isinf(c1), true); +} + +TEST(complex, isnan) { + // *********** complex ************* + complex a; + a.real = float(NAN); + EXPECT_EQ(std::isnan(a), true); + a.imag = float(NAN); + EXPECT_EQ(std::isnan(a), true); + + complex b = float(NAN); + EXPECT_EQ(std::isnan(b), true); + + complex c(float(NAN), 0); + EXPECT_EQ(std::isnan(c), true); + + // *********** complex ************* + complex a1; + a1.real = double(NAN); + EXPECT_EQ(std::isnan(a1), true); + a1.imag = double(NAN); + EXPECT_EQ(std::isnan(a1), true); + + complex b1 = double(NAN); + EXPECT_EQ(std::isnan(b1), true); + + complex c1(double(NAN), 0); + EXPECT_EQ(std::isnan(c1), true); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/complex_test.cu b/paddle/fluid/platform/complex_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..b46d1b7b271d78fd436682fa2a5ffae974e61326 --- /dev/null +++ b/paddle/fluid/platform/complex_test.cu @@ -0,0 +1,361 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
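The GPU test below largely exercises the explicit conversions complex.h declares toward thrust::complex and the vendor cuComplex/hipComplex types. A hedged sketch of that interop, assuming a CUDA build of Paddle (PADDLE_WITH_CUDA defined) and a helper name of my own:

#include <cuComplex.h>
#include <thrust/complex.h>
#include "paddle/fluid/platform/complex.h"

using paddle::platform::complex;

// Round-trip a value through the vendor and thrust representations.
bool InteropSketch() {
  complex<float> a(1.0f, 2.0f);
  cuFloatComplex c = static_cast<cuFloatComplex>(a);  // explicit operator
  thrust::complex<float> t(a);                        // explicit operator
  complex<float> back(t);                             // explicit constructor
  return cuCrealf(c) == a.real && t.imag() == a.imag && back == a;
}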
+ +#include "paddle/fluid/platform/complex.h" + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/eigen_ext.h" +#include "paddle/fluid/platform/enforce.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +namespace paddle { +namespace platform { + +TEST(complex, conversion_on_gpu) { + // *********** complex ************* + // thrust from and to complex + complex a(1.0f, 2.0f); + EXPECT_EQ(complex(thrust::complex(a)).real, 1.0); + EXPECT_EQ(complex(thrust::complex(a)).imag, 2.0); + + complex a1(1.0, 2.0); + EXPECT_EQ(complex(thrust::complex(a1)).real, 1.0); + EXPECT_EQ(complex(thrust::complex(a1)).imag, 2.0); + +#if defined(PADDLE_WITH_HIP) + EXPECT_EQ(hipFloatComplex(a).real(), 1.0); + EXPECT_EQ(hipFloatComplex(a).imag(), 2.0); + EXPECT_EQ(hipDoubleComplex(a).real(), 1.0); + EXPECT_EQ(hipDoubleComplex(a).imag(), 2.0); + + EXPECT_EQ(hipFloatComplex(a1).real(), 1.0); + EXPECT_EQ(hipFloatComplex(a1).imag(), 2.0); + EXPECT_EQ(hipDoubleComplex(a1).real(), 1.0); + EXPECT_EQ(hipDoubleComplex(a1).imag(), 2.0); +#else + EXPECT_EQ(cuCrealf(cuFloatComplex(a)), 1.0); + EXPECT_EQ(cuCimagf(cuFloatComplex(a)), 2.0); + EXPECT_EQ(cuCreal(cuDoubleComplex(a)), 1.0); + EXPECT_EQ(cuCimag(cuDoubleComplex(a)), 2.0); + + EXPECT_EQ(cuCrealf(cuFloatComplex(a1)), 1.0); + EXPECT_EQ(cuCimagf(cuFloatComplex(a1)), 2.0); + EXPECT_EQ(cuCreal(cuDoubleComplex(a1)), 1.0); + EXPECT_EQ(cuCimag(cuDoubleComplex(a1)), 2.0); +#endif + + EXPECT_EQ(complex().real, 0.0f); + EXPECT_EQ(complex().imag, 0.0f); + + EXPECT_EQ(complex(1.0f, 1.0f).real, 1.0f); + EXPECT_EQ(complex(1.0f, 1.0f).imag, 1.0f); + EXPECT_EQ(complex(0.0f, 1.0f).real, 0.0f); + EXPECT_EQ(complex(0.0f, 1.0f).imag, 1.0f); + + EXPECT_EQ(complex(1.0f).real, 1.0f); + EXPECT_EQ(complex(1.0f).imag, 0.0f); + + // int to complex + EXPECT_EQ(complex(1).real, 1.0f); + EXPECT_EQ(complex(0).real, 0.0f); + EXPECT_EQ(complex(2).real, 2.0f); + EXPECT_EQ(complex(-2).real, -2.0f); + + // bool to complex + EXPECT_EQ(complex(true).real, 1.0f); + EXPECT_EQ(complex(true).imag, 0.0f); + + // complex to complex + EXPECT_EQ(complex(complex(1.0, 2.0)).real, 1.0f); + EXPECT_EQ(complex(complex(1.0, 2.0)).imag, 2.0f); + + // std::complex to complex + EXPECT_EQ(complex(std::complex(1.0f, 2.0f)).real, 1.0f); + EXPECT_EQ(complex(std::complex(1.0f, 2.0f)).imag, 2.0f); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0f); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0f); + + // Assignment operator + complex c = 1.0f; + EXPECT_EQ(c.real, 1.0f); + EXPECT_EQ(c.imag, 0.0f); + c = complex(2.0, 2.0); + EXPECT_EQ(c.real, 2.0f); + EXPECT_EQ(c.imag, 2.0f); + + // Conversion operator + EXPECT_EQ(static_cast(complex(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(complex(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(complex(-1)), -1); + EXPECT_EQ(static_cast(complex(true)), true); + + // *********** complex ************* + // double to complex + EXPECT_EQ(complex().real, 0.0); + EXPECT_EQ(complex().imag, 0.0); + + EXPECT_EQ(complex(1.0, 1.0).real, 1.0); + EXPECT_EQ(complex(1.0, 1.0).imag, 1.0); + EXPECT_EQ(complex(0.0, 1.0).real, 0.0); + EXPECT_EQ(complex(0.0, 1.0).imag, 1.0); + + EXPECT_EQ(complex(1.0).real, 1.0); + EXPECT_EQ(complex(1.0).imag, 0.0); + + // int to complex + EXPECT_EQ(complex(1).real, 1.0); + EXPECT_EQ(complex(0).real, 0.0); + EXPECT_EQ(complex(2).real, 2.0); + 
EXPECT_EQ(complex(-2).real, -2.0); + + // bool to complex + EXPECT_EQ(complex(true).real, 1.0); + EXPECT_EQ(complex(true).imag, 0.0); + + // complex to complex + EXPECT_EQ(complex(complex(1.0f, 2.0f)).real, 1.0); + EXPECT_EQ(complex(complex(1.0f, 2.0f)).imag, 2.0); + + // std::complex to complex + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).real, 1.0); + EXPECT_EQ(complex(std::complex(1.0, 2.0)).imag, 2.0); + + // Assignment operator + complex c1 = 1.0; + EXPECT_EQ(c1.real, 1.0); + EXPECT_EQ(c1.imag, 0.0); + c1 = complex(2.0, 2.0); + EXPECT_EQ(c1.real, 2.0); + EXPECT_EQ(c1.imag, 2.0); + + // Conversion operator + EXPECT_EQ(static_cast(complex(0.5)), 0.5); + EXPECT_NEAR(static_cast(complex(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(complex(-1)), -1); + EXPECT_EQ(static_cast(complex(true)), true); +} + +TEST(bfloat16, comparison_cpu) { + // *********** complex ************* + EXPECT_TRUE(complex(1.0f) == complex(1.0f)); + EXPECT_TRUE(complex(1.0f, 2.0f) == complex(1.0f, 2.0f)); + EXPECT_FALSE(complex(-1.0f) == complex(-0.5f)); + EXPECT_TRUE(complex(1.0f) != complex(0.5f)); + EXPECT_FALSE(complex(-1.0f) != complex(-1.0f)); + EXPECT_TRUE(complex(1.0f) < complex(2.0f)); + EXPECT_FALSE(complex(-1.0f) < complex(-1.0f)); + EXPECT_TRUE(complex(1.0f) <= complex(1.0f)); + EXPECT_TRUE(complex(2.0f) > complex(1.0f)); + EXPECT_FALSE(complex(-2.0f) > complex(-2.0f)); + EXPECT_TRUE(complex(2.0f) >= complex(2.0f)); + + // *********** complex ************* + EXPECT_TRUE(complex(1.0) == complex(1.0)); + EXPECT_TRUE(complex(1.0, 2.0) == complex(1.0, 2.0)); + EXPECT_FALSE(complex(-1.0) == complex(-0.5f)); + EXPECT_TRUE(complex(1.0) != complex(0.5f)); + EXPECT_FALSE(complex(-1.0) != complex(-1.0)); + EXPECT_TRUE(complex(1.0) < complex(2.0)); + EXPECT_FALSE(complex(-1.0) < complex(-1.0)); + EXPECT_TRUE(complex(1.0) <= complex(1.0)); + EXPECT_TRUE(complex(2.0) > complex(1.0)); + EXPECT_FALSE(complex(-2.0) > complex(-2.0)); + EXPECT_TRUE(complex(2.0) >= complex(2.0)); +} + +TEST(complex, arithmetic_cpu) { + // *********** complex ************* + complex a = complex(1, 1) + complex(1, 1); + EXPECT_NEAR(a.real, 2, 0.001); + EXPECT_NEAR(a.imag, 2, 0.001); + + complex b = complex(-5, -5) + complex(5, 5); + EXPECT_EQ(b.real, 0); + EXPECT_EQ(b.imag, 0); + + complex c = + complex(0.33333f, 0.33333f) + complex(0.66667f, 0.66667f); + EXPECT_NEAR(c.real, 1.0f, 0.01); + EXPECT_NEAR(c.imag, 1.0f, 0.01); + + complex d = complex(3) - complex(5); + EXPECT_EQ(d.real, -2); + EXPECT_EQ(d.imag, 0); + + complex e = + complex(0.66667f, 0.66667f) - complex(0.33333f, 0.33333f); + EXPECT_NEAR(e.real, 0.33334f, 0.01); + EXPECT_NEAR(e.imag, 0.33334f, 0.01); + + complex f = complex(0.33f, 0.33f) * complex(0.2f, 0.2f); + EXPECT_NEAR(f.real, 0.0f, 0.01); + EXPECT_NEAR(f.imag, 0.132f, 0.01); + + complex g = complex(0.33f, 0.33f) / complex(0.2f, 0.2f); + EXPECT_NEAR(g.real, 1.65f, 0.01); + EXPECT_NEAR(g.imag, 0.0f, 0.01); + + complex h = -complex(0.33f, 0.33f); + EXPECT_NEAR(h.real, -0.33f, 0.01); + EXPECT_NEAR(h.imag, -0.33f, 0.01); + h = -complex(-0.33f, -0.33f); + EXPECT_NEAR(h.real, 0.33f, 0.01); + EXPECT_NEAR(h.imag, 0.33f, 0.01); + + complex i = complex(1.0, 1.0); + i += complex(2.0, 2.0); + EXPECT_NEAR(i.real, 3.0f, 0.01); + EXPECT_NEAR(i.imag, 3.0f, 0.01); + i -= complex(1.0, 1.0); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 2.0f, 0.01); + i *= complex(3, 2); + EXPECT_NEAR(i.real, 2.0f, 0.01); + 
EXPECT_NEAR(i.imag, 10.0f, 0.01); + i /= complex(3, 2); + EXPECT_NEAR(i.real, 2.0f, 0.01); + EXPECT_NEAR(i.imag, 2.0f, 0.01); + + // *********** complex ************* + complex a1 = complex(1, 1) + complex(1, 1); + EXPECT_NEAR(a1.real, 2, 0.001); + EXPECT_NEAR(a1.imag, 2, 0.001); + + complex b1 = complex(-5, -5) + complex(5, 5); + EXPECT_EQ(b1.real, 0); + EXPECT_EQ(b1.imag, 0); + + complex c1 = + complex(0.33333f, 0.33333f) + complex(0.66667f, 0.66667f); + EXPECT_NEAR(c1.real, 1.0f, 0.01); + EXPECT_NEAR(c1.imag, 1.0f, 0.01); + + complex d1 = complex(3) - complex(5); + EXPECT_EQ(d1.real, -2); + EXPECT_EQ(d1.imag, 0); + + complex e1 = + complex(0.66667f, 0.66667f) - complex(0.33333f, 0.33333f); + EXPECT_NEAR(e1.real, 0.33334f, 0.01); + EXPECT_NEAR(e1.imag, 0.33334f, 0.01); + + complex f1 = + complex(0.33f, 0.33f) * complex(0.2f, 0.2f); + EXPECT_NEAR(f1.real, 0.0f, 0.01); + EXPECT_NEAR(f1.imag, 0.132f, 0.01); + + complex g1 = + complex(0.33f, 0.33f) / complex(0.2f, 0.2f); + EXPECT_NEAR(g1.real, 1.65f, 0.01); + EXPECT_NEAR(g1.imag, 0.0f, 0.01); + + complex h1 = -complex(0.33f, 0.33f); + EXPECT_NEAR(h1.real, -0.33f, 0.01); + EXPECT_NEAR(h1.imag, -0.33f, 0.01); + h1 = -complex(-0.33f, -0.33f); + EXPECT_NEAR(h1.real, 0.33f, 0.01); + EXPECT_NEAR(h1.imag, 0.33f, 0.01); + + complex i1 = complex(1.0, 1.0); + i1 += complex(2.0, 2.0); + EXPECT_NEAR(i1.real, 3.0f, 0.01); + EXPECT_NEAR(i1.imag, 3.0f, 0.01); + i1 -= complex(1.0, 1.0); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 2.0f, 0.01); + i1 *= complex(3, 2); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 10.0f, 0.01); + i1 /= complex(3, 2); + EXPECT_NEAR(i1.real, 2.0f, 0.01); + EXPECT_NEAR(i1.imag, 2.0f, 0.01); +} + +TEST(complex, print) { + complex a(1.0f); + std::cout << a << std::endl; + + complex b(1.0); + std::cout << b << std::endl; +} + +TEST(complex, isinf) { + // *********** complex ************* + complex a; + a.real = float(INFINITY); + EXPECT_EQ(std::isinf(a), true); + a.imag = float(INFINITY); + EXPECT_EQ(std::isinf(a), true); + + complex b = float(INFINITY); + EXPECT_EQ(std::isinf(b), true); + + complex c(float(INFINITY), 0); + EXPECT_EQ(std::isinf(c), true); + + // *********** complex ************* + complex a1; + a1.real = double(INFINITY); + EXPECT_EQ(std::isinf(a1), true); + a1.imag = double(INFINITY); + EXPECT_EQ(std::isinf(a1), true); + + complex b1 = double(INFINITY); + EXPECT_EQ(std::isinf(b1), true); + + complex c1(double(INFINITY), 0); + EXPECT_EQ(std::isinf(c1), true); +} + +TEST(complex, isnan) { + // *********** complex ************* + complex a; + a.real = float(NAN); + EXPECT_EQ(std::isnan(a), true); + a.imag = float(NAN); + EXPECT_EQ(std::isnan(a), true); + + complex b = float(NAN); + EXPECT_EQ(std::isnan(b), true); + + complex c(float(NAN), 0); + EXPECT_EQ(std::isnan(c), true); + + // *********** complex ************* + complex a1; + a1.real = double(NAN); + EXPECT_EQ(std::isnan(a1), true); + a1.imag = double(NAN); + EXPECT_EQ(std::isnan(a1), true); + + complex b1 = double(NAN); + EXPECT_EQ(std::isnan(b1), true); + + complex c1(double(NAN), 0); + EXPECT_EQ(std::isnan(c1), true); +} + +} // namespace platform +} // namespace paddle +#endif \ No newline at end of file diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 923c97350e89ea9a3de01120bb7df57766247a38..6405b556217660bc0efb52eef33c83a3aceafc80 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -104,6 +104,23 @@ size_t CUDAPinnedMaxChunkSize() { return 
CUDAPinnedMaxAllocSize() / 256; } +size_t NPUPinnedMaxAllocSize() { + // For distributed systems, it requires configuring and limiting + // the fraction of memory to use. + return FLAGS_fraction_of_cuda_pinned_memory_to_use * CpuTotalPhysicalMemory(); +} + +size_t NPUPinnedMinChunkSize() { + // Allow to allocate the minimum chunk size is 64 KB. + return 1 << 16; +} + +size_t NPUPinnedMaxChunkSize() { + // Allow to allocate the maximum chunk size is roughly 1/256 of NPU_PINNED + // memory. + return NPUPinnedMaxAllocSize() / 256; +} + #ifdef PADDLE_WITH_XBYAK static Xbyak::util::Cpu cpu; bool MayIUse(const cpu_isa_t cpu_isa) { diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 94527149d4e0b459dee03375d56fb0a9526aa055..29dc0a15aaea11c77f926877ab01abadc5ea3a73 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -73,6 +73,15 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); +//! Get the maximum allocation size for a machine. +size_t NPUPinnedMaxAllocSize(); + +//! Get the minimum chunk size for buddy allocator. +size_t NPUPinnedMinChunkSize(); + +//! Get the maximum chunk size for buddy allocator. +size_t NPUPinnedMaxChunkSize(); + typedef enum { isa_any, sse42, diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index dde9531e59144218c91d789a8fe668d3fffb70f2..352143302388a9f8169a40a14ccea9bae647cfc6 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -16,8 +16,7 @@ limitations under the License. */ // NOTE(): support float16 to half in header file. #define PADDLE_CUDA_FP16 -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -32,6 +31,7 @@ namespace platform { #endif inline static int RoundToPowerOfTwo(int dim) { +#ifdef PADDLE_WITH_CUDA if (dim > 512) { return 1024; } else if (dim > 256) { @@ -45,6 +45,17 @@ inline static int RoundToPowerOfTwo(int dim) { } else { return 32; } +#else // HIP results in error or nan if > 256 + if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; + } +#endif } #define CUDA_LAUNCH_KERNEL_BASE(dim, ...) 
\ @@ -82,28 +93,52 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, #endif } -// CUDA 9.0 have native compatible float16 shfl_down #if defined(PADDLE_WITH_HIP) template <> __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, float16 val, int delta, int width) { -#ifdef PADDLE_WITH_HIP return float16(__shfl_down(static_cast(val), static_cast(delta), width)); -#else - return float16( - __shfl_down(static_cast(val), static_cast(delta), width)); -#endif } + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( + unsigned mask, paddle::platform::complex val, int delta, int width) { + float real = __shfl_down(val.real, delta, width); + float imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex +CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, + int delta, int width) { + double real = __shfl_down(val.real, delta, width); + double imag = __shfl_down(val.imag, delta, width); + return paddle::platform::complex(real, imag); +} + template <> __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, float16 val, int width) { -#ifdef PADDLE_WITH_HIP return float16(__shfl_xor(static_cast(val), width)); -#else - return float16(__shfl_xor(static_cast(val), width)); -#endif +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + float real = __shfl_xor(val.real, width); + float imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); +} + +template <> +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { + double real = __shfl_xor(val.real, width); + double imag = __shfl_xor(val.imag, width); + return paddle::platform::complex(real, imag); } #else template <> @@ -115,25 +150,26 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, } template <> -__forceinline__ __device__ paddle::platform::complex64 CudaShuffleDownSync( - unsigned mask, paddle::platform::complex64 val, int delta, int width) { +__forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( + unsigned mask, paddle::platform::complex val, int delta, int width) { float real = static_cast(__shfl_down_sync( mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( mask, static_cast(val.imag), static_cast(delta), width)); - return paddle::platform::complex64(real, imag); + return paddle::platform::complex(real, imag); } template <> -__forceinline__ __device__ paddle::platform::complex128 CudaShuffleDownSync( - unsigned mask, paddle::platform::complex128 val, int delta, int width) { +__forceinline__ __device__ paddle::platform::complex +CudaShuffleDownSync(unsigned mask, paddle::platform::complex val, + int delta, int width) { double real = static_cast( __shfl_down_sync(mask, static_cast(val.real), static_cast(delta), width)); double imag = static_cast( __shfl_down_sync(mask, static_cast(val.imag), static_cast(delta), width)); - return paddle::platform::complex128(real, imag); + return paddle::platform::complex(real, imag); } template <> @@ -143,23 +179,23 @@ __forceinline__ __device__ float16 CudaShuffleXorSync(unsigned mask, } template <> -__forceinline__ __device__ paddle::platform::complex64 CudaShuffleXorSync( - unsigned mask, paddle::platform::complex64 val, 
int width) { +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { float real = static_cast( __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( __shfl_xor_sync(mask, static_cast(val.imag), width)); - return paddle::platform::complex64(real, imag); + return paddle::platform::complex(real, imag); } template <> -__forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync( - unsigned mask, paddle::platform::complex128 val, int width) { +__forceinline__ __device__ paddle::platform::complex CudaShuffleXorSync( + unsigned mask, paddle::platform::complex val, int width) { double real = static_cast( __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( __shfl_xor_sync(mask, static_cast(val.imag), width)); - return paddle::platform::complex128(real, imag); + return paddle::platform::complex(real, imag); } #endif diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index 94f64d158afbcbc702e5c1a47cefb61a9118067b..4708a99e8fc4ca9682500602da95a710d34e268e 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -20,8 +20,7 @@ limitations under the License. */ #include #endif #include -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -135,18 +134,18 @@ CUDA_ATOMIC_WRAPPER(Add, float16) { } #endif -CUDA_ATOMIC_WRAPPER(Add, complex64) { +CUDA_ATOMIC_WRAPPER(Add, complex) { float *real = reinterpret_cast(address); float *imag = real + 1; - return complex64(CudaAtomicAdd(real, val.real), - CudaAtomicAdd(imag, val.imag)); + return complex(CudaAtomicAdd(real, val.real), + CudaAtomicAdd(imag, val.imag)); } -CUDA_ATOMIC_WRAPPER(Add, complex128) { +CUDA_ATOMIC_WRAPPER(Add, complex) { double *real = reinterpret_cast(address); double *imag = real + 1; - return complex128(CudaAtomicAdd(real, val.real), - CudaAtomicAdd(imag, val.imag)); + return complex(CudaAtomicAdd(real, val.real), + CudaAtomicAdd(imag, val.imag)); } // For atomicMax diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h index 05a431e731e32c2b36f0aebfa11cb95f2607929c..8e969588afbbcf5d49f71f5165668cb7fb946e6c 100644 --- a/paddle/fluid/platform/cudnn_desc.h +++ b/paddle/fluid/platform/cudnn_desc.h @@ -79,6 +79,11 @@ inline cudnnDataType_t ToCudnnDataType( case framework::proto::VarType::FP64: type = CUDNN_DATA_DOUBLE; break; +#if CUDNN_VERSION_MIN(8, 1, 0) + case framework::proto::VarType::BF16: + type = CUDNN_DATA_BFLOAT16; + break; +#endif default: break; } diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 6c3c96b68c48a1314f4a90a97a2542ea3060446a..65dd69a37d37f8116deee0e63ab89d9249f908ba 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -34,35 +34,6 @@ DECLARE_bool(cudnn_deterministic); namespace paddle { namespace platform { -inline const char* cudnnGetErrorString(cudnnStatus_t status) { - switch (status) { - case CUDNN_STATUS_SUCCESS: - return "CUDNN_STATUS_SUCCESS"; - case CUDNN_STATUS_NOT_INITIALIZED: - return "CUDNN_STATUS_NOT_INITIALIZED"; - case CUDNN_STATUS_ALLOC_FAILED: - return "CUDNN_STATUS_ALLOC_FAILED"; - case CUDNN_STATUS_BAD_PARAM: - return "CUDNN_STATUS_BAD_PARAM"; - case CUDNN_STATUS_INTERNAL_ERROR: - return 
"CUDNN_STATUS_INTERNAL_ERROR"; - case CUDNN_STATUS_INVALID_VALUE: - return "CUDNN_STATUS_INVALID_VALUE"; - case CUDNN_STATUS_ARCH_MISMATCH: - return "CUDNN_STATUS_ARCH_MISMATCH"; - case CUDNN_STATUS_MAPPING_ERROR: - return "CUDNN_STATUS_MAPPING_ERROR"; - case CUDNN_STATUS_EXECUTION_FAILED: - return "CUDNN_STATUS_EXECUTION_FAILED"; - case CUDNN_STATUS_NOT_SUPPORTED: - return "CUDNN_STATUS_NOT_SUPPORTED"; - case CUDNN_STATUS_LICENSE_ERROR: - return "CUDNN_STATUS_LICENSE_ERROR"; - default: - return "Unknown cudnn error number"; - } -} - #define CUDNN_VERSION_MIN(major, minor, patch) \ (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) @@ -131,6 +102,25 @@ inline ActivationMode StringToActivationMode(const std::string& str) { template class CudnnDataType; +// CUDNN_DATA_BFLOAT16 is not valid before cudnn8.1 +#if CUDNN_VERSION_MIN(8, 1, 0) +template <> +class CudnnDataType { + public: + static const cudnnDataType_t type = CUDNN_DATA_BFLOAT16; + using ScalingParamType = const float; + using BatchNormParamType = float; + static ScalingParamType* kOne() { + static ScalingParamType v = 1.0; + return &v; + } + static ScalingParamType* kZero() { + static ScalingParamType v = 0.0; + return &v; + } +}; +#endif + template <> class CudnnDataType { public: diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 50bb64d5574440a9565793e578322f171b6586a1..1179677fd6b9f57152cf7821f6fd088b8945c129 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -153,6 +153,16 @@ DeviceContextPool::DeviceContextPool( PADDLE_THROW(platform::errors::Unimplemented( "NPUPlace is not supported. Please " "re-compile with WITH_ASCEND_CL option.")); +#endif + } else if (platform::is_npu_pinned_place(p)) { +#ifdef PADDLE_WITH_ASCEND_CL + EmplaceDeviceContext( + &device_contexts_, p); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPinnedPlace is not supported. 
Please re-compile with " + "WITH_ASCEND_CL " + "option.")); #endif } } @@ -264,6 +274,22 @@ aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); } Place NPUDeviceContext::GetPlace() const { return place_; } aclrtContext NPUDeviceContext::context() const { return context_; } + +NPUPinnedDeviceContext::NPUPinnedDeviceContext() { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place) + : place_(place) { + eigen_device_.reset(new Eigen::DefaultDevice()); +} + +Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const { + return eigen_device_.get(); +} + +Place NPUPinnedDeviceContext::GetPlace() const { return place_; } + #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -537,6 +563,7 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) : CPUDeviceContext(place), p_blobmap_() { p_blobmap_.reset(new BlobMap()); + p_exec_items_.reset(new ExecShape()); p_mutex_.reset(new std::mutex()); } @@ -560,7 +587,7 @@ MKLDNNDeviceContextThreadLocals::Body::~Body() { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(cpu_place); - dev_ctx->ResetBlobMap(); + dev_ctx->ResetBlobMap(exec_ptr_); } void MKLDNNDeviceContextThreadLocals::Body::set_cur_mkldnn_session_id( @@ -607,17 +634,52 @@ mkldnn::stream& MKLDNNDeviceContextThreadLocals::Body::get_stream(void) { return cur_stream; } -void MKLDNNDeviceContext::ResetBlobMap() { +void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { std::lock_guard lock(*p_mutex_); if (!block_next_cache_clearing_) { VLOG(3) << "Clearing DNNL cache."; - p_blobmap_->clear(); + // If no specific executor pointer is given, then clear + // everything. Otherwise clear only the objects + // allocated while using the given executor + if (ptr == nullptr) { + p_blobmap_->clear(); + } else { + // Iterate through all shapes and, for each shape and + // the active executor, release all entries + // of this executor + for (auto& s : *p_exec_items_) { + for (auto& v : (*s.second)[ptr]) { + (v.first)->erase(v.second); + } + s.second->erase(ptr); + } + } } else { VLOG(3) << "Prevented Clearing DNNL cache."; block_next_cache_clearing_ = false; } } +void MKLDNNDeviceContext::RemoveShapeEntriesWithExecutor(void) const { + p_exec_items_->erase(p_exec_items_->begin()); +} + +void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t<KeyBlob> pblob, + KeyBlob::iterator it) const { + // Take the current input shape from TLS, + // take the current executor address from TLS, + // and for this executor's items add the one defined by the arguments + auto key_it = p_exec_items_ + ->insert(std::make_pair(tls().cur_input_shape_str, + std::make_shared<ExecMap>())) + .first; + (*key_it->second)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it)); + + VLOG(3) << "LinkEntryWithExecutor, shapes: " << p_exec_items_->size() + << " curr exec size: " + << (*key_it->second)[tls().get_curr_exec()].size() << "\n"; +} + void MKLDNNDeviceContext::BlockNextCacheClearing() { std::lock_guard lock(*p_mutex_); VLOG(3) << "Next DNNL cache clearing has been blocked."; @@ -672,6 +734,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, VLOG(2) << "sid=" << sid << ", remove all blobs of shape: " << sBlob->begin()->first; sBlob->erase(sBlob->begin()->first); + RemoveShapeEntriesWithExecutor(); } pBlob = std::make_shared(); (*sBlob)[tls().cur_input_shape_str] = pBlob; @@ -682,7 +745,11 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, // Find Blob via name auto blob_it = pBlob->find(name); if (blob_it == pBlob->end()) { + auto el = + pBlob->insert(std::make_pair(name, data)); // (*pBlob)[name] = data; + // Register the new element in the per-executor map + // so it can be easily erased when the executor terminates + LinkEntryWithExecutor(pBlob, el.first); } else { blob_it->second = data; // set data to existing blob } @@ -691,7 +758,7 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, return; } -unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) { +unsigned int MKLDNNDeviceContext::GetCachedObjectsNumber(void) const { unsigned int num_entries = 0; for (auto const& l3 : *p_blobmap_) { for (auto const& l2 : *(l3.second)) { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index f79cb1ab94788126a562764ac6ff7efc4b302d2e..e2dbc90b5d1444b7f27ac00439a769ee3165a911 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -233,6 +233,27 @@ template <> struct DefaultDeviceContextType { using TYPE = NPUDeviceContext; }; + +// Currently, NPUPinnedDeviceContext is only used for data copying. 
+class NPUPinnedDeviceContext : public DeviceContext { + public: + NPUPinnedDeviceContext(); + explicit NPUPinnedDeviceContext(NPUPinnedPlace place); + + Place GetPlace() const override; + + Eigen::DefaultDevice* eigen_device() const; + + private: + NPUPinnedPlace place_; + std::unique_ptr<Eigen::DefaultDevice> eigen_device_; +}; + +template <> +struct DefaultDeviceContextType<platform::NPUPinnedPlace> { + using TYPE = NPUPinnedDeviceContext; +}; + #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -337,15 +358,16 @@ class CUDAContext { PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenGetVersion( &miopen_major, &miopen_minor, &miopen_patch)); auto local_miopen_version = - (miopen_major * 1000 + miopen_minor * 100 + miopen_patch) / 100; - auto compile_miopen_version = MIOPEN_VERSION / 100; + (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; + auto compile_miopen_version = MIOPEN_VERSION / 10; if (local_miopen_version < static_cast(compile_miopen_version)) { LOG_FIRST_N(WARNING, 1) << "WARNING: device: " << place_.device << ". The installed Paddle is compiled with MIOPEN " - << compile_miopen_version / 10 << "." << compile_miopen_version % 10 + << compile_miopen_version / 100 << "." + << compile_miopen_version % 100 << ", but MIOPEN version in your machine is " - << local_miopen_version / 10 << "." << local_miopen_version % 10 + << local_miopen_version / 100 << "." << local_miopen_version % 100 << ", which may cause serious incompatible bug. " << "Please recompile or reinstall Paddle with compatible MIOPEN " "version."; @@ -673,6 +695,7 @@ class MKLDNNDeviceContextThreadLocals { mkldnn::stream cur_stream; std::string key_suffix; // Key identifying current Executor bool key_attach_thread_id = true; + void* exec_ptr_ = nullptr; Body(); ~Body(); @@ -689,6 +712,8 @@ class MKLDNNDeviceContextThreadLocals { const std::string& get_key_suffix(void) const { return key_suffix; } void disable_tid_in_key(void) { key_attach_thread_id = false; } bool is_tid_used_in_key(void) const { return key_attach_thread_id; } + void set_curr_exec(void* exec_ptr) { exec_ptr_ = exec_ptr; } + void* get_curr_exec(void) const { return exec_ptr_; } }; MKLDNNDeviceContextThreadLocals() = default; MKLDNNDeviceContextThreadLocals(const MKLDNNDeviceContextThreadLocals& c) = @@ -724,13 +749,26 @@ class MKLDNNDeviceContext : public CPUDeviceContext { using ShapeBlob = umap_key_string_t; using BlobMap = umap_value_smart_t; + // Auxiliary two-level structure (shape, executor) to make it easier to + // control clearing cache objects related to a specific executor + + using ExecKey = void*; + using ExecMapCacheIterPair = std::pair<BlobPtr_t<KeyBlob>, KeyBlob::iterator>; + using ExecMap = + std::unordered_map<ExecKey, std::vector<ExecMapCacheIterPair>>; + using ExecShape = std::unordered_map<std::string, std::shared_ptr<ExecMap>>; + explicit MKLDNNDeviceContext(CPUPlace place); /* \brief Get the active engine */ const mkldnn::engine& GetEngine() const { return tls().get_engine(); } + // Register an object in the currently used executor's map + void LinkEntryWithExecutor(BlobPtr_t<KeyBlob>, KeyBlob::iterator) const; + void RemoveShapeEntriesWithExecutor(void) const; + // Remove all entries from the blob map - void ResetBlobMap(); + void ResetBlobMap(void* ptr); // Prevent next ResetBlobMap() void BlockNextCacheClearing(); @@ -742,7 +780,7 @@ class MKLDNNDeviceContext : public CPUDeviceContext { void SetBlob(const std::string& name, std::shared_ptr data) const; // Calculate number of oneDNN objects cached - unsigned int GetCachedObjectsNumber(void); + unsigned int GetCachedObjectsNumber(void) const; // Find a saved blob. 
Return nullptr if not found std::shared_ptr GetBlob(const std::string& name) const; @@ -753,6 +791,9 @@ class MKLDNNDeviceContext : public CPUDeviceContext { private: std::shared_ptr p_blobmap_; + // Map key is a pointer to the executor and value is the data (an iterator + // in the map) needed for erasing + std::shared_ptr<ExecShape> p_exec_items_; std::shared_ptr p_mutex_; bool block_next_cache_clearing_ = false; }; diff --git a/paddle/fluid/platform/device_memory_aligment.cc b/paddle/fluid/platform/device_memory_aligment.cc index f8e031104415e848101d97d2f66217847630c923..383dbd23ca0a59ab6c7289ae18d04ec11d429661 100644 --- a/paddle/fluid/platform/device_memory_aligment.cc +++ b/paddle/fluid/platform/device_memory_aligment.cc @@ -16,20 +16,26 @@ limitations under the License. */ namespace paddle { namespace platform { -size_t Alignment(size_t size, const platform::Place &place) { - size_t alignment = 1024; - if (platform::is_cpu_place(place)) { - alignment = CpuMinChunkSize(); +size_t Alignment(size_t size, const platform::Place &place, int align_size) { + size_t alignment = 0; + if (align_size > 0) { + alignment = align_size; } else { + alignment = 1024; + if (platform::is_cpu_place(place)) { + alignment = CpuMinChunkSize(); + } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - alignment = GpuMinChunkSize(); + alignment = GpuMinChunkSize(); #elif defined(PADDLE_WITH_XPU) - // TODO(wangxi): add XpuMinChunkSize - alignment = alignment; + alignment = alignment; +#elif defined(PADDLE_WITH_ASCEND_CL) + alignment = NPUMinChunkSize(); #else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Fluid is not compiled with CUDA.")); + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Fluid is not compiled with CUDA/XPU/NPU.")); #endif + } } size_t remaining = size % alignment; return remaining == 0 ? size : size + (alignment - remaining); diff --git a/paddle/fluid/platform/device_memory_aligment.h b/paddle/fluid/platform/device_memory_aligment.h index a151e434833587549e35c3ccfe1d8d8f43469a76..dda526a7557c261659cf6228291f2b1260d5a943 100644 --- a/paddle/fluid/platform/device_memory_aligment.h +++ b/paddle/fluid/platform/device_memory_aligment.h @@ -19,10 +19,16 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/gpu_info.h" +#elif defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/npu_info.h" +#endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/npu_info.h" #endif namespace paddle { namespace platform { -size_t Alignment(size_t size, const platform::Place &place); +size_t Alignment(size_t size, const platform::Place &place, + int align_size = -1); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 724a9b8483cdee5d98cd2988aea7e57c9bfc8ff5..1bd46c0bfafaab92a2217751ee80ce1872af4474 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -511,7 +511,7 @@ class DeviceTracerImpl : public DeviceTracer { auto c = correlations_.find(r.correlation_id); if (c != correlations_.end() && c->second != nullptr) { event->set_name(c->second->name()); - event->set_detail_info(r.name); + event->set_detail_info(c->second->attr()); find++; } else { VLOG(10) << "Missing Kernel Event: " + r.name; diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index b25fb5978d055da2314621d9d0ddac52cbe37e6b..21d9e8607459a484328c785242f4112cc3951263 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -2,6 +2,10 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc) +if (NOT WITH_NV_JETSON) + list(APPEND CUDA_SRCS nvjpeg.cc) +endif() + if (WITH_ROCM) list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc) endif() diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index b49875f256bb26f5bc99031cf1f85284b30673b3..f0a46e0818af748b37e0abce44096fe3cf73b126 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -100,6 +100,9 @@ static constexpr char* win_cublas_lib = static constexpr char* win_curand_lib = "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll"; +static constexpr char* win_nvjpeg_lib = + "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll"; static constexpr char* win_cusolver_lib = "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll;cusolver64_10.dll"; @@ -107,6 +110,9 @@ static constexpr char* win_cusolver_lib = static constexpr char* win_curand_lib = "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;curand64_" CUDA_VERSION_MAJOR ".dll"; +static constexpr char* win_nvjpeg_lib = + "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll"; static constexpr char* win_cusolver_lib = "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll"; @@ -213,17 +219,17 @@ static inline void* GetDsoHandleFromSearchPath( for (auto dso : dso_names) { // 1. search in user config path by FLAGS dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags); - // 2. search in extra paths + // 2. search in system default path + if (nullptr == dso_handle) { + dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags); + } + // 3. 
search in extra paths if (nullptr == dso_handle) { for (auto path : extra_paths) { VLOG(3) << "extra_paths: " << path; dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags); } } - // 3. search in system default path - if (nullptr == dso_handle) { - dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags); - } if (nullptr != dso_handle) break; } @@ -330,6 +336,17 @@ void* GetCurandDsoHandle() { #endif } +void* GetNvjpegDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_nvjpeg_lib, true, + {cuda_lib_path}); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so"); +#endif +} + void* GetCusolverDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index 8424160931690624a291275a20700a158ee61ad4..9ab6dca0126bcbdd02625e2f263ad7c466b5e966 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -29,6 +29,7 @@ void* GetCublasDsoHandle(); void* GetCUDNNDsoHandle(); void* GetCUPTIDsoHandle(); void* GetCurandDsoHandle(); +void* GetNvjpegDsoHandle(); void* GetCusolverDsoHandle(); void* GetNVRTCDsoHandle(); void* GetCUDADsoHandle(); diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index 5ff4bff4bff6527140369307067f958c2cb16866..f72eb6731f6276c049b2fe397cda660fd61c1def 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/port.h" -#define MIOPEN_VERSION \ - (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 100 + \ +#define MIOPEN_VERSION \ + (MIOPEN_VERSION_MAJOR * 1000 + MIOPEN_VERSION_MINOR * 10 + \ MIOPEN_VERSION_PATCH) // NOLINT namespace paddle { @@ -110,6 +110,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(miopenActivationBackward); \ __macro(miopenConvolutionBackwardWeights); \ __macro(miopenConvolutionForward); \ + __macro(miopenConvolutionForwardBias); \ __macro(miopenConvolutionBackwardBias); \ __macro(miopenConvolutionForwardGetWorkSpaceSize); \ __macro(miopenConvolutionBackwardDataGetWorkSpaceSize); \ diff --git a/paddle/fluid/operators/log_loss_op.cu b/paddle/fluid/platform/dynload/nvjpeg.cc similarity index 57% rename from paddle/fluid/operators/log_loss_op.cu rename to paddle/fluid/platform/dynload/nvjpeg.cc index 280913c43a2749ddd5fbd3ae1905f1b823dd525d..eb0ad78b9b73cd38e2d6dd1f58433da41094dd3f 100644 --- a/paddle/fluid/operators/log_loss_op.cu +++ b/paddle/fluid/platform/dynload/nvjpeg.cc @@ -1,21 +1,27 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/log_loss_op.h" -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - log_loss, ops::LogLossKernel); -REGISTER_OP_CUDA_KERNEL( - log_loss_grad, - ops::LogLossGradKernel); +#include "paddle/fluid/platform/dynload/nvjpeg.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag nvjpeg_dso_flag; +void *nvjpeg_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nvjpeg.h b/paddle/fluid/platform/dynload/nvjpeg.h new file mode 100644 index 0000000000000000000000000000000000000000..ae457b2958f5deff9d879b012a0e06108d86c830 --- /dev/null +++ b/paddle/fluid/platform/dynload/nvjpeg.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_CUDA +#include +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag nvjpeg_dso_flag; +extern void *nvjpeg_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + nvjpegStatus_t operator()(Args... args) { \ + using nvjpegFunc = decltype(&::__name); \ + std::call_once(nvjpeg_dso_flag, []() { \ + nvjpeg_dso_handle = paddle::platform::dynload::GetNvjpegDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define NVJPEG_RAND_ROUTINE_EACH(__macro) \ + __macro(nvjpegCreateSimple); \ + __macro(nvjpegJpegStateCreate); \ + __macro(nvjpegGetImageInfo); \ + __macro(nvjpegJpegStateDestroy); \ + __macro(nvjpegDecode); + +NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc index 1d105a1fd8682552b5b8e375e9d94206fe84ee98..8153877b7bbb892a5c108316f7fe28510fc64b79 100644 --- a/paddle/fluid/platform/dynload/tensorrt.cc +++ b/paddle/fluid/platform/dynload/tensorrt.cc @@ -43,8 +43,17 @@ void* GetDsoHandle(const std::string& dso_name) { if (nullptr == dso_handle) { auto error_msg = "You are using Paddle compiled with TensorRT, but TensorRT dynamic " - "library is not found. Ignore this if TensorRT is not needed.\n"; - std::cerr << error_msg; + "library is not found. Ignore this if TensorRT is not needed.\n" + "The TensorRT that Paddle depends on is not configured correctly.\n" + " Suggestions:\n" + " 1. Check if the TensorRT is installed correctly and its version" + " is matched with paddlepaddle you installed.\n" + " 2. 
Configure environment variables as " + "follows:\n" + " - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n" + " - Windows: set PATH by `set PATH=XXX;%PATH%`\n" + " - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...`\n"; + LOG(WARNING) << error_msg; } return dso_handle; } diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/fluid/platform/eigen_ext.h index 0db4cc71b1b21085513c4703475e651b8d8edd74..2b3d1693f6245e511e734b7015af9a2614e9d80f 100644 --- a/paddle/fluid/platform/eigen_ext.h +++ b/paddle/fluid/platform/eigen_ext.h @@ -15,8 +15,7 @@ #pragma once #include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/hostdevice.h" @@ -24,9 +23,9 @@ namespace Eigen { -using complex64 = paddle::platform::complex64; -using complex128 = paddle::platform::complex128; using float16 = paddle::platform::float16; +template +using complex = paddle::platform::complex; template struct NumTraits; @@ -62,7 +61,7 @@ struct NumTraits }; template <> -struct NumTraits : GenericNumTraits> { +struct NumTraits> : GenericNumTraits> { typedef float Real; typedef typename NumTraits::Literal Literal; enum { @@ -84,7 +83,7 @@ struct NumTraits : GenericNumTraits> { }; template <> -struct NumTraits : GenericNumTraits> { +struct NumTraits> : GenericNumTraits> { typedef double Real; typedef typename NumTraits::Literal Literal; enum { @@ -157,6 +156,12 @@ HOSTDEVICE inline paddle::platform::bfloat16 exp( return paddle::platform::bfloat16(::expf(static_cast(a))); } +template <> +HOSTDEVICE inline paddle::platform::bfloat16 expm1( + const paddle::platform::bfloat16& a) { + return paddle::platform::bfloat16(::expm1f(static_cast(a))); +} + template <> HOSTDEVICE inline paddle::platform::bfloat16 erf( const paddle::platform::bfloat16& a) { @@ -224,133 +229,135 @@ HOSTDEVICE inline paddle::platform::bfloat16 maxi( return a < b ? 
b : a; } -//////////// complex64 methods ///////////// +//////////// complex methods ///////////// template <> -HOSTDEVICE inline bool(isnan)(const complex64& a) { +HOSTDEVICE inline bool(isnan)(const complex& a) { return (paddle::platform::isnan)(a); } template <> -HOSTDEVICE inline bool(isinf)(const complex64& a) { +HOSTDEVICE inline bool(isinf)(const complex& a) { return (paddle::platform::isinf)(a); } template <> -HOSTDEVICE inline bool(isfinite)(const complex64& a) { +HOSTDEVICE inline bool(isfinite)(const complex& a) { return (paddle::platform::isfinite)(a); } template <> -HOSTDEVICE inline complex64 exp(const complex64& a) { +HOSTDEVICE inline complex exp(const complex& a) { float com = ::expf(a.real); float res_real = com * ::cosf(a.imag); float res_imag = com * ::sinf(a.imag); - return complex64(res_real, res_imag); + return complex(res_real, res_imag); } template <> -HOSTDEVICE inline complex64 log(const complex64& a) { +HOSTDEVICE inline complex log(const complex& a) { return paddle::platform::log(a); } template <> -HOSTDEVICE inline complex64 tanh(const complex64& a) { +HOSTDEVICE inline complex tanh(const complex& a) { return paddle::platform::tanh(a); } template <> -HOSTDEVICE inline complex64 sqrt(const complex64& a) { +HOSTDEVICE inline complex sqrt(const complex& a) { return paddle::platform::sqrt(a); } template <> -HOSTDEVICE inline complex64 ceil(const complex64& a) { - return complex64(::ceilf(a.real), ::ceilf(a.imag)); +HOSTDEVICE inline complex ceil(const complex& a) { + return complex(::ceilf(a.real), ::ceilf(a.imag)); } template <> -HOSTDEVICE inline complex64 floor(const complex64& a) { - return complex64(::floorf(a.real), ::floor(a.imag)); +HOSTDEVICE inline complex floor(const complex& a) { + return complex(::floorf(a.real), ::floor(a.imag)); } template <> -HOSTDEVICE inline complex64 round(const complex64& a) { - return complex64(::roundf(a.real), ::roundf(a.imag)); +HOSTDEVICE inline complex round(const complex& a) { + return complex(::roundf(a.real), ::roundf(a.imag)); } template <> -HOSTDEVICE inline complex64 pow(const complex64& a, const complex64& b) { +HOSTDEVICE inline complex pow(const complex& a, + const complex& b) { return paddle::platform::pow(a, b); } template <> -HOSTDEVICE inline float abs(const complex64& a) { +HOSTDEVICE inline float abs(const complex& a) { return paddle::platform::abs(a); } -//////////// complex128 methods ///////////// +//////////// complex methods ///////////// template <> -HOSTDEVICE inline bool(isnan)(const complex128& a) { +HOSTDEVICE inline bool(isnan)(const complex& a) { return (paddle::platform::isnan)(a); } template <> -HOSTDEVICE inline bool(isinf)(const complex128& a) { +HOSTDEVICE inline bool(isinf)(const complex& a) { return (paddle::platform::isinf)(a); } template <> -HOSTDEVICE inline bool(isfinite)(const complex128& a) { +HOSTDEVICE inline bool(isfinite)(const complex& a) { return (paddle::platform::isfinite)(a); } template <> -HOSTDEVICE inline complex128 exp(const complex128& a) { +HOSTDEVICE inline complex exp(const complex& a) { double com = ::expf(a.real); double res_real = com * ::cosf(a.imag); double res_imag = com * ::sinf(a.imag); - return complex128(res_real, res_imag); + return complex(res_real, res_imag); } template <> -HOSTDEVICE inline complex128 log(const complex128& a) { +HOSTDEVICE inline complex log(const complex& a) { return paddle::platform::log(a); } template <> -HOSTDEVICE inline complex128 tanh(const complex128& a) { +HOSTDEVICE inline complex tanh(const complex& a) { return 
paddle::platform::tanh(a); } template <> -HOSTDEVICE inline complex128 sqrt(const complex128& a) { +HOSTDEVICE inline complex sqrt(const complex& a) { return paddle::platform::sqrt(a); } template <> -HOSTDEVICE inline complex128 ceil(const complex128& a) { - return complex128(::ceilf(a.real), ::ceilf(a.imag)); +HOSTDEVICE inline complex ceil(const complex& a) { + return complex(::ceilf(a.real), ::ceilf(a.imag)); } template <> -HOSTDEVICE inline complex128 floor(const complex128& a) { - return complex128(::floorf(a.real), ::floor(a.imag)); +HOSTDEVICE inline complex floor(const complex& a) { + return complex(::floorf(a.real), ::floor(a.imag)); } template <> -HOSTDEVICE inline complex128 round(const complex128& a) { - return complex128(::roundf(a.real), ::roundf(a.imag)); +HOSTDEVICE inline complex round(const complex& a) { + return complex(::roundf(a.real), ::roundf(a.imag)); } template <> -HOSTDEVICE inline complex128 pow(const complex128& a, const complex128& b) { +HOSTDEVICE inline complex pow(const complex& a, + const complex& b) { return paddle::platform::pow(a, b); } template <> -HOSTDEVICE inline double abs(const complex128& a) { +HOSTDEVICE inline double abs(const complex& a) { return paddle::platform::abs(a); } @@ -376,6 +383,11 @@ HOSTDEVICE inline float16 exp(const float16& a) { return float16(::expf(static_cast(a))); } +template <> +HOSTDEVICE inline float16 expm1(const float16& a) { + return float16(::expm1f(static_cast(a))); +} + template <> HOSTDEVICE inline float16 erf(const float16& a) { return float16(::erff(static_cast(a))); diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index cfca3ceadf41a2e769569da7f56ac01d56ad2341..c63ea3fa8573b8a7fd739931869c8f53259d8a77 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -34,7 +34,7 @@ limitations under the License. 
*/ #include #include #include -#include "paddle/fluid/platform/cuda_error.pb.h" +#include "paddle/fluid/platform/external_error.pb.h" #endif // PADDLE_WITH_CUDA #ifdef PADDLE_WITH_HIP @@ -682,41 +682,83 @@ struct EOFException : public std::exception { END_HANDLE_THE_ERROR \ } while (0) -/** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/ +/**************************************************************************/ +/**************************** NVIDIA ERROR ********************************/ #ifdef PADDLE_WITH_CUDA -/***** CUDA ERROR *****/ -inline bool is_error(cudaError_t e) { return e != cudaSuccess; } +namespace details { -inline std::string GetCudaErrorWebsite(int32_t cuda_version) { - std::ostringstream webstr; - webstr << "https://docs.nvidia.com/cuda/"; - if (cuda_version != -1) { - double version = cuda_version / 10; - webstr << "archive/" << std::fixed << std::setprecision(1) << version; +template +struct ExternalApiType {}; + +#define DEFINE_EXTERNAL_API_TYPE(type, success_value, proto_type) \ + template <> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + static constexpr const char* kTypeString = #proto_type; \ + static constexpr platform::proto::ApiType kProtoType = \ + platform::proto::ApiType::proto_type; \ } - webstr << "/cuda-runtime-api/group__CUDART__TYPES.html" - "#group__CUDART__TYPES_1g3f51e3575c2178246db0a94a430e0038"; - return webstr.str(); -} -inline std::string build_nvidia_error_msg(cudaError_t e) { -#if CUDA_VERSION >= 10000 && CUDA_VERSION < 11000 - int32_t cuda_version = 100; -#elif CUDA_VERSION >= 9000 - int32_t cuda_version = 90; -#else - int32_t cuda_version = -1; +DEFINE_EXTERNAL_API_TYPE(cudaError_t, cudaSuccess, CUDA); +DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND); +DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN); +DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS); +DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER); + +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL); #endif + +} // namespace details + +template +inline const char* GetErrorMsgUrl(T status) { + using __CUDA_STATUS_TYPE__ = decltype(status); + platform::proto::ApiType proto_type = + details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType; + switch (proto_type) { + case platform::proto::ApiType::CUDA: + return "https://docs.nvidia.com/cuda/cuda-runtime-api/" + "group__CUDART__TYPES.html#group__CUDART__TYPES_" + "1g3f51e3575c2178246db0a94a430e0038"; + break; + case platform::proto::ApiType::CURAND: + return "https://docs.nvidia.com/cuda/curand/" + "group__HOST.html#group__HOST_1gb94a31d5c165858c96b6c18b70644437"; + break; + case platform::proto::ApiType::CUDNN: + return "https://docs.nvidia.com/deeplearning/cudnn/api/" + "index.html#cudnnStatus_t"; + break; + case platform::proto::ApiType::CUBLAS: + return "https://docs.nvidia.com/cuda/cublas/index.html#cublasstatus_t"; + break; + case platform::proto::ApiType::CUSOLVER: + return "https://docs.nvidia.com/cuda/cusolver/" + "index.html#cuSolverSPstatus"; + break; + case platform::proto::ApiType::NCCL: + return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/" + "types.html#ncclresult-t"; + break; + default: + return "Unknown type of External API, can't get error message URL!"; + break; + } +} + +template +inline std::string GetExternalErrorMsg(T status) { std::ostringstream sout; - sout << " Cuda error(" << e << 
"), " << cudaGetErrorString(e) << "."; - static platform::proto::cudaerrorDesc cudaerror; - static bool _initSucceed = false; - if (cudaerror.ByteSizeLong() == 0) { + bool _initSucceed = false; + platform::proto::ExternalErrorDesc externalError; + if (externalError.ByteSizeLong() == 0) { std::string filePath; #if !defined(_WIN32) Dl_info info; - if (dladdr(reinterpret_cast(GetCudaErrorWebsite), &info)) { + if (dladdr(reinterpret_cast(GetCurrentTraceBackString), &info)) { std::string strModule(info.dli_fname); const size_t last_slash_idx = strModule.find_last_of("/"); std::string compare_path = strModule.substr(strModule.length() - 6); @@ -724,21 +766,22 @@ inline std::string build_nvidia_error_msg(cudaError_t e) { strModule.erase(last_slash_idx, std::string::npos); } if (compare_path.compare("avx.so") == 0) { - filePath = strModule + - "/../include/third_party/cudaerror/data/cudaErrorMessage.pb"; - } else { filePath = - strModule + "/../../thirl_party/cudaerror/data/cudaErrorMessage.pb"; + strModule + + "/../include/third_party/externalError/data/externalErrorMsg.pb"; + } else { + filePath = strModule + + "/../../third_party/externalError/data/externalErrorMsg.pb"; } } #else - char buf[100]; + char buf[512]; MEMORY_BASIC_INFORMATION mbi; HMODULE h_module = - (::VirtualQuery(GetCudaErrorWebsite, &mbi, sizeof(mbi)) != 0) + (::VirtualQuery(GetCurrentTraceBackString, &mbi, sizeof(mbi)) != 0) ? (HMODULE)mbi.AllocationBase : NULL; - GetModuleFileName(h_module, buf, 100); + GetModuleFileName(h_module, buf, 512); std::string strModule(buf); const size_t last_slash_idx = strModule.find_last_of("\\"); std::string compare_path = strModule.substr(strModule.length() - 7); @@ -746,198 +789,118 @@ inline std::string build_nvidia_error_msg(cudaError_t e) { strModule.erase(last_slash_idx, std::string::npos); } if (compare_path.compare("avx.pyd") == 0) { - filePath = - strModule + - "\\..\\include\\third_party\\cudaerror\\data\\cudaErrorMessage.pb"; + filePath = strModule + + "\\..\\include\\third_" + "party\\externalerror\\data\\externalErrorMsg.pb"; } else { filePath = - strModule + "\\..\\third_party\\cudaerror\\data\\cudaErrorMessage.pb"; + strModule + + "\\..\\..\\third_party\\externalerror\\data\\externalErrorMsg.pb"; } #endif std::ifstream fin(filePath, std::ios::in | std::ios::binary); - _initSucceed = cudaerror.ParseFromIstream(&fin); + _initSucceed = externalError.ParseFromIstream(&fin); } + using __CUDA_STATUS_TYPE__ = decltype(status); + platform::proto::ApiType proto_type = + details::ExternalApiType<__CUDA_STATUS_TYPE__>::kProtoType; if (_initSucceed) { - for (int i = 0; i < cudaerror.allmessages_size(); ++i) { - if (cuda_version == cudaerror.allmessages(i).version()) { - for (int j = 0; j < cudaerror.allmessages(i).messages_size(); ++j) { - if (e == cudaerror.allmessages(i).messages(j).errorcode()) { - sout << "\n [Advise: " - << cudaerror.allmessages(i).messages(j).errormessage() << "]"; + for (int i = 0; i < externalError.errors_size(); ++i) { + if (proto_type == externalError.errors(i).type()) { + for (int j = 0; j < externalError.errors(i).messages_size(); ++j) { + if (status == externalError.errors(i).messages(j).code()) { + sout << "\n [Hint: " + << externalError.errors(i).messages(j).message() << "]"; return sout.str(); } } } } } - sout << "\n [Advise: Please search for the error code(" << e - << ") on website( " << GetCudaErrorWebsite(cuda_version) - << " ) to get Nvidia's official solution about CUDA Error.]"; + + sout << "\n [Hint: Please search for the error code(" << status + << 
") on website (" << GetErrorMsgUrl(status) + << ") to get Nvidia's official solution and advice about " + << details::ExternalApiType<__CUDA_STATUS_TYPE__>::kTypeString + << " Error.]"; return sout.str(); } -/** curand ERROR **/ -inline bool is_error(curandStatus_t stat) { - return stat != CURAND_STATUS_SUCCESS; +template std::string GetExternalErrorMsg(cudaError_t); +template std::string GetExternalErrorMsg(curandStatus_t); +template std::string GetExternalErrorMsg(cudnnStatus_t); +template std::string GetExternalErrorMsg(cublasStatus_t); +template std::string GetExternalErrorMsg(cusolverStatus_t); +#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) +template std::string GetExternalErrorMsg(ncclResult_t); +#endif + +/*************** CUDA ERROR ***************/ +inline bool is_error(cudaError_t e) { return e != cudaSuccess; } + +inline std::string build_nvidia_error_msg(cudaError_t e) { + std::ostringstream sout; + sout << "CUDA error(" << e << "), " << cudaGetErrorString(e) << ". " + << GetExternalErrorMsg(e); + return sout.str(); } -inline const char* curandGetErrorString(curandStatus_t stat) { - switch (stat) { - case CURAND_STATUS_SUCCESS: - return "`CURAND_STATUS_SUCCESS`. No errors."; - case CURAND_STATUS_VERSION_MISMATCH: - return "`CURAND_STATUS_VERSION_MISMATCH`. Header file and linked library " - "version do not match."; - case CURAND_STATUS_NOT_INITIALIZED: - return "`CURAND_STATUS_NOT_INITIALIZED`. Generator not initialized."; - case CURAND_STATUS_ALLOCATION_FAILED: - return "`CURAND_STATUS_ALLOCATION_FAILED`. Memory allocation failed."; - case CURAND_STATUS_TYPE_ERROR: - return "`CURAND_STATUS_TYPE_ERROR`. Generator is wrong type."; - case CURAND_STATUS_OUT_OF_RANGE: - return "`CURAND_STATUS_OUT_OF_RANGE`. Argument out of range."; - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "`CURAND_STATUS_LENGTH_NOT_MULTIPLE`. Length requested is not a " - "multple of dimension."; - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "`CURAND_STATUS_DOUBLE_PRECISION_REQUIRED`. GPU does not have " - "double precision required by MRG32k3a."; - case CURAND_STATUS_LAUNCH_FAILURE: - return "`CURAND_STATUS_LAUNCH_FAILURE`. Kernel launch failure."; - case CURAND_STATUS_PREEXISTING_FAILURE: - return "`CURAND_STATUS_PREEXISTING_FAILURE`. Preexisting failure on " - "library entry."; - case CURAND_STATUS_INITIALIZATION_FAILED: - return "`CURAND_STATUS_INITIALIZATION_FAILED`. Initialization of CUDA " - "failed."; - case CURAND_STATUS_ARCH_MISMATCH: - return "`CURAND_STATUS_ARCH_MISMATCH`. Architecture mismatch, GPU does " - "not support requested feature."; - case CURAND_STATUS_INTERNAL_ERROR: - return "`CURAND_STATUS_INTERNAL_ERROR`. Internal library error."; - default: - return "Unknown curand status"; - } +/*************** CURAND ERROR ***************/ +inline bool is_error(curandStatus_t stat) { + return stat != CURAND_STATUS_SUCCESS; } inline std::string build_nvidia_error_msg(curandStatus_t stat) { - std::string msg(" Curand error, "); - return msg + curandGetErrorString(stat) + " "; + std::ostringstream sout; + sout << "CURAND error(" << stat << "). 
" << GetExternalErrorMsg(stat); + return sout.str(); } -/***** CUDNN ERROR *****/ +/*************** CUDNN ERROR ***************/ inline bool is_error(cudnnStatus_t stat) { return stat != CUDNN_STATUS_SUCCESS; } inline std::string build_nvidia_error_msg(cudnnStatus_t stat) { - std::string msg(" Cudnn error, "); - return msg + platform::dynload::cudnnGetErrorString(stat) + " "; + std::ostringstream sout; + sout << "CUDNN error(" << stat << "), " + << platform::dynload::cudnnGetErrorString(stat) << ". " + << GetExternalErrorMsg(stat); + return sout.str(); } -/***** CUBLAS ERROR *****/ +/*************** CUBLAS ERROR ***************/ inline bool is_error(cublasStatus_t stat) { return stat != CUBLAS_STATUS_SUCCESS; } -inline const char* cublasGetErrorString(cublasStatus_t stat) { - switch (stat) { - case CUBLAS_STATUS_NOT_INITIALIZED: - return "`CUBLAS_STATUS_NOT_INITIALIZED`. The cuBLAS library was not " - "initialized."; - case CUBLAS_STATUS_ALLOC_FAILED: - return "`CUBLAS_STATUS_ALLOC_FAILED`. Resource allocation failed inside " - "the cuBLAS library."; - case CUBLAS_STATUS_INVALID_VALUE: - return "`CUBLAS_STATUS_INVALID_VALUE`. An unsupported value or parameter " - "was passed to the function (a negative vector size, for " - "example)."; - case CUBLAS_STATUS_ARCH_MISMATCH: - return "`CUBLAS_STATUS_ARCH_MISMATCH`. The function requires a feature " - "absent from the device architecture; usually caused by the lack " - "of support for double precision."; - case CUBLAS_STATUS_MAPPING_ERROR: - return "`CUBLAS_STATUS_MAPPING_ERROR`. An access to GPU memory space " - "failed, which is usually caused by a failure to bind a texture."; - case CUBLAS_STATUS_EXECUTION_FAILED: - return "`CUBLAS_STATUS_EXECUTION_FAILED`. The GPU program failed to " - "execute. This is often caused by a launch failure of the kernel " - "on the GPU, which can be caused by multiple reasons."; - case CUBLAS_STATUS_INTERNAL_ERROR: - return "`CUBLAS_STATUS_INTERNAL_ERROR`. An internal cuBLAS operation " - "failed. This error is usually caused by a cudaMemcpyAsync() " - "failure."; - case CUBLAS_STATUS_NOT_SUPPORTED: - return "`CUBLAS_STATUS_NOT_SUPPORTED`. The functionality requested is " - "not supported."; - case CUBLAS_STATUS_LICENSE_ERROR: - return "`CUBLAS_STATUS_LICENSE_ERROR`. The functionality requested " - "requires some license and an error was detected when trying to " - "check the current licensing."; - default: - return "Unknown cublas status"; - } -} - inline std::string build_nvidia_error_msg(cublasStatus_t stat) { - std::string msg(" Cublas error, "); - return msg + cublasGetErrorString(stat) + " "; + std::ostringstream sout; + sout << "CUBLAS error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); } -/***** CUSOLVER ERROR *****/ +/*************** CUSOLVER ERROR ***************/ inline bool is_error(cusolverStatus_t stat) { return stat != CUSOLVER_STATUS_SUCCESS; } -inline const char* cusolverGetErrorString(cusolverStatus_t stat) { - switch (stat) { - case CUSOLVER_STATUS_NOT_INITIALIZED: - return "`CUSOLVER_STATUS_NOT_INITIALIZED`. The cuSolver library was not " - "initialized. This is usually caused by the lack of a prior call, " - "an error in the CUDA Runtime API called by the cuSolver routine, " - "or an error in the hardware setup."; - case CUSOLVER_STATUS_ALLOC_FAILED: - return "`CUSOLVER_STATUS_ALLOC_FAILED`. Resource allocation failed " - "inside the cuSolver library. 
This is usually caused by a " - "cudaMalloc() failure."; - case CUSOLVER_STATUS_INVALID_VALUE: - return "`CUSOLVER_STATUS_INVALID_VALUE`. An unsupported value or " - "parameter was passed to the function (a negative vector size, " - "for example)."; - case CUSOLVER_STATUS_ARCH_MISMATCH: - return "`CUSOLVER_STATUS_ARCH_MISMATCH`. The function requires a feature " - "absent from the device architecture; usually caused by the lack " - "of support for atomic operations or double precision."; - case CUSOLVER_STATUS_EXECUTION_FAILED: - return "`CUSOLVER_STATUS_EXECUTION_FAILED`. The GPU program failed to " - "execute. This is often caused by a launch failure of the kernel " - "on the GPU, which can be caused by multiple reasons."; - case CUSOLVER_STATUS_INTERNAL_ERROR: - return "`CUSOLVER_STATUS_INTERNAL_ERROR`. An internal cuSolver operation " - "failed. This error is usually caused by a cudaMemcpyAsync() " - "failure."; - case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "`CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED`. The matrix type is " - "not supported by this function. This is usually caused by " - "passing an invalid matrix descriptor to the function."; - default: - return "Unknown cusolver status"; - } -} - inline std::string build_nvidia_error_msg(cusolverStatus_t stat) { - std::string msg(" Cublas error, "); - return msg + cusolverGetErrorString(stat) + " "; + std::ostringstream sout; + sout << "CUSOLVER error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); } -/****** NCCL ERROR ******/ +/**************** NCCL ERROR ****************/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { return nccl_result != ncclSuccess; } inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { - std::string msg(" Nccl error, "); + std::ostringstream sout; + sout << "NCCL error(" << nccl_result << "), " + << platform::dynload::ncclGetErrorString(nccl_result) << ". "; if (errno == ENOSPC || errno == EAGAIN) { std::string detail(strerror(errno)); detail += "\nPlease try one of the following solutions:"; @@ -947,42 +910,19 @@ inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { "\n3. 
Increase shared memory by setting the -shm-size " "option when starting docker container, e.g., setting " " -shm-size=2g.\n"; - return msg + platform::dynload::ncclGetErrorString(nccl_result) + - ", detail: " + detail + " "; + sout << " Detail: " + detail; } - return msg + platform::dynload::ncclGetErrorString(nccl_result) + " "; + sout << GetExternalErrorMsg(nccl_result); + return sout.str(); } #endif // not(__APPLE__) and PADDLE_WITH_NCCL -namespace details { - -template -struct CudaStatusType {}; - -#define DEFINE_CUDA_STATUS_TYPE(type, success_value) \ - template <> \ - struct CudaStatusType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - } - -DEFINE_CUDA_STATUS_TYPE(cudaError_t, cudaSuccess); -DEFINE_CUDA_STATUS_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS); -DEFINE_CUDA_STATUS_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS); -DEFINE_CUDA_STATUS_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS); -DEFINE_CUDA_STATUS_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS); - -#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); -#endif -} // namespace details - #define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ do { \ auto __cond__ = (COND); \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ + ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \ auto __summary__ = ::paddle::platform::errors::External( \ @@ -991,6 +931,16 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); } \ } while (0) +#define PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(OP) \ + do { \ + auto res = cudaGetLastError(); \ + if (UNLIKELY(res != cudaSuccess)) { \ + auto msg = ::paddle::platform::build_nvidia_error_msg(res); \ + PADDLE_THROW(platform::errors::Fatal("CUDA error after kernel (%s): %s", \ + OP, msg)); \ + } \ + } while (0) + inline void retry_sleep(unsigned milliseconds) { #ifdef _WIN32 Sleep(milliseconds); @@ -1013,7 +963,7 @@ inline void retry_sleep(unsigned milliseconds) { int retry_count = 1; \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ + ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ retry_sleep(FLAGS_gpu_allocator_retry_time); \ @@ -1027,10 +977,11 @@ inline void retry_sleep(unsigned milliseconds) { } \ } while (0) -#undef DEFINE_CUDA_STATUS_TYPE +#undef DEFINE_EXTERNAL_API_TYPE #endif // PADDLE_WITH_CUDA -/** HIP PADDLE ENFORCE FUNCTIONS AND MACROS **/ +/**************************************************************************/ +/***************************** HIP ERROR **********************************/ #ifdef PADDLE_WITH_HIP /***** HIP ERROR *****/ @@ -1042,7 +993,7 @@ inline std::string build_rocm_error_msg(hipError_t e) { return sout.str(); } -/** HIPRAND ERROR **/ +/***** HIPRAND ERROR *****/ inline bool is_error(hiprandStatus_t stat) { return stat != HIPRAND_STATUS_SUCCESS; } @@ -1143,22 +1094,22 @@ inline std::string build_rocm_error_msg(ncclResult_t nccl_result) { namespace details { template -struct CudaStatusType {}; +struct ExternalApiType {}; -#define DEFINE_CUDA_STATUS_TYPE(type, success_value) \ - template <> \ - struct CudaStatusType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ +#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ + template 
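A hedged usage sketch of the two checks defined above; the kernel, stream and pointer names are placeholders and not part of this patch:

// Illustrative only.
// 1) Status-returning calls: compare against the success value registered
//    through DEFINE_EXTERNAL_API_TYPE and throw an External error otherwise.
PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync(
    dst_ptr, src_ptr, num_bytes, cudaMemcpyDeviceToDevice, stream));

// 2) Kernel launches return void, so the launch is followed by a
//    cudaGetLastError() check that names the kernel in the Fatal message.
my_kernel<<<grid, block, 0, stream>>>(dst_ptr, src_ptr, n);
PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS("my_kernel");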
<> \ + struct ExternalApiType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ } -DEFINE_CUDA_STATUS_TYPE(hipError_t, hipSuccess); -DEFINE_CUDA_STATUS_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS); -DEFINE_CUDA_STATUS_TYPE(miopenStatus_t, miopenStatusSuccess); -DEFINE_CUDA_STATUS_TYPE(rocblas_status, rocblas_status_success); +DEFINE_EXTERNAL_API_TYPE(hipError_t, hipSuccess); +DEFINE_EXTERNAL_API_TYPE(hiprandStatus_t, HIPRAND_STATUS_SUCCESS); +DEFINE_EXTERNAL_API_TYPE(miopenStatus_t, miopenStatusSuccess); +DEFINE_EXTERNAL_API_TYPE(rocblas_status, rocblas_status_success); #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) -DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); #endif } // namespace details @@ -1168,7 +1119,7 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); auto __cond__ = (COND); \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ + ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ if (UNLIKELY(__cond__ != __success_type__)) { \ auto __summary__ = ::paddle::platform::errors::External( \ @@ -1191,7 +1142,7 @@ inline void retry_sleep(unsigned millisecond) { int retry_count = 1; \ using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ + ::paddle::platform::details::ExternalApiType< \ __CUDA_STATUS_TYPE__>::kSuccess; \ while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ retry_sleep(FLAGS_gpu_allocator_retry_time); \ @@ -1205,7 +1156,7 @@ inline void retry_sleep(unsigned millisecond) { } \ } while (0) -#undef DEFINE_CUDA_STATUS_TYPE +#undef DEFINE_EXTERNAL_API_TYPE #endif // PADDLE_WITH_HIP #ifdef PADDLE_WITH_ASCEND_CL diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 39f3d3f00c9997eea3f4ab1e5652fcc78f1be0a6..95a852ad6e92a3ec2f8ecc08f5378ed91301f3c3 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -304,6 +304,7 @@ bool CheckCudaStatusFailure(T value, const std::string& msg) { return false; } catch (paddle::platform::EnforceNotMet& error) { std::string ex_msg = error.what(); + std::cout << ex_msg << std::endl; return ex_msg.find(msg) != std::string::npos; } } @@ -338,29 +339,96 @@ TEST(enforce, hip_success) { #else TEST(enforce, cuda_success) { EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess)); - EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "Cuda error")); - EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "Cuda error")); + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue, "CUDA error")); + + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation, "CUDA error")); + + EXPECT_TRUE(CheckCudaStatusFailure( + cudaErrorInsufficientDriver, + "This indicates that the installed NVIDIA CUDA driver is older than the " + "CUDA runtime library. 
This is not a supported configuration.Users " + "should install an updated NVIDIA display driver to allow the " + "application to run")); + EXPECT_TRUE(CheckCudaStatusFailure( + cudaErrorContextIsDestroyed, + "This error indicates that the context current to the calling thread has " + "been destroyed using cuCtxDestroy, or is a primary context which has " + "not yet been initialized")); EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS)); EXPECT_TRUE( - CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "Curand error")); + CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH, "CURAND error")); EXPECT_TRUE( - CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "Curand error")); + CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED, "CURAND error")); + EXPECT_TRUE(CheckCudaStatusFailure( + CURAND_STATUS_ARCH_MISMATCH, + "Architecture mismatch, GPU does not support requested feature")); + EXPECT_TRUE( + CheckCudaStatusFailure(CURAND_STATUS_LENGTH_NOT_MULTIPLE, + "Length requested is not a multple of dimension")); EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS)); EXPECT_TRUE( - CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "Cudnn error")); - EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "Cudnn error")); + CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED, "CUDNN error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED, "CUDNN error")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUDNN_STATUS_BAD_PARAM, + "An incorrect value or parameter was passed to the function. To correct, " + "ensure that all the parameters being passed have valid values")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUDNN_STATUS_LICENSE_ERROR, + "The functionality requested requires some license and an error was " + "detected when trying to check the current licensing. This error can " + "happen if the license is not present or is expired or if the " + "environment variable NVIDIA_LICENSE_FILE is not set properly")); EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS)); EXPECT_TRUE( - CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "Cublas error")); + CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED, "CUBLAS error")); + EXPECT_TRUE( + CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "CUBLAS error")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUBLAS_STATUS_EXECUTION_FAILED, + "The GPU program failed to execute. This is often caused by a launch " + "failure of the kernel on the GPU, which can be caused by multiple " + "reasons. To correct: check that the hardware, an appropriate version " + "of the driver, and the cuBLAS library are correctly installed")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUBLAS_STATUS_MAPPING_ERROR, + "An access to GPU memory space failed, which is usually caused by a " + "failure to bind a texture. To correct: prior to the function call, " + "unbind any previously bound textures")); + + EXPECT_TRUE(CheckCudaStatusSuccess(CUSOLVER_STATUS_SUCCESS)); + EXPECT_TRUE(CheckCudaStatusFailure(CUSOLVER_STATUS_NOT_INITIALIZED, + "CUSOLVER error")); EXPECT_TRUE( - CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE, "Cublas error")); + CheckCudaStatusFailure(CUSOLVER_STATUS_ALLOC_FAILED, "CUSOLVER error")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUSOLVER_STATUS_INTERNAL_ERROR, + "An internal cuSolver operation failed. This error is usually caused by " + "a cudaMemcpyAsync() failure.To correct: check that the hardware, an " + "appropriate version of the driver, and the cuSolver library are " + "correctly installed. 
Also, check that the memory passed as a parameter " + "to the routine is not being deallocated prior to the routine’s " + "completion")); + EXPECT_TRUE(CheckCudaStatusFailure( + CUSOLVER_STATUS_INVALID_VALUE, + "An unsupported value or parameter was passed to the function (a " + "negative vector size, for example).To correct: ensure that all the " + "parameters being passed have valid values")); + #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); - EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Nccl error")); - EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Nccl error")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "NCCL error")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclInternalError, + "An internal check failed. This is either " + "a bug in NCCL or due to memory " + "corruption")); + EXPECT_TRUE(CheckCudaStatusFailure(ncclInvalidUsage, + "The call to NCCL is incorrect. This is " + "usually reflecting a programming error")); #endif } #endif diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 0985b884d1daf727ccabf76a3040a1576f2f96b7..3a81cfab865c2835d02e031dc6b3d0128ecba2a9 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -40,7 +40,7 @@ class Event { // The DeviceContext is used to get the cuda stream. // If CPU profiling mode, can pass nullptr. Event(EventType type, std::string name, uint32_t thread_id, - EventRole role = EventRole::kOrdinary); + EventRole role = EventRole::kOrdinary, std::string attr = "none"); const EventType& type() const; Event* parent() const { return parent_; } @@ -50,7 +50,7 @@ class Event { uint32_t thread_id() const { return thread_id_; } void set_name(std::string name) { name_ = name; } void set_role(EventRole role) { role_ = role; } - + std::string attr() const { return attr_; } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifndef PADDLE_WITH_CUPTI gpuEvent_t event() const { return event_; } @@ -69,6 +69,7 @@ class Event { EventRole role_{}; int64_t cpu_ns_; bool visited_status_{false}; + std::string attr_; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUPTI int64_t gpu_ns_ = 0; diff --git a/paddle/fluid/platform/cuda_error.proto b/paddle/fluid/platform/external_error.proto similarity index 58% rename from paddle/fluid/platform/cuda_error.proto rename to paddle/fluid/platform/external_error.proto index b55e0af81ee6f8fb47d558287c7f902ef0fde81b..2094de7e10f69e98cc450d4221a85c6f904770ed 100644 --- a/paddle/fluid/platform/cuda_error.proto +++ b/paddle/fluid/platform/external_error.proto @@ -15,21 +15,32 @@ limitations under the License. */ syntax = "proto2"; package paddle.platform.proto; +// (NOTE:zhouwei): ApiType describes which kind of external third party API +// More external third party API can be added. 
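For orientation, a minimal sketch of reading the serialized message defined below; it assumes protoc has generated external_error.pb.h from this file and that externalErrorMsg.pb is present in the working directory:

#include <fstream>
#include <iostream>
#include "paddle/fluid/platform/external_error.pb.h"  // assumed generated header name

int main() {
  paddle::platform::proto::ExternalErrorDesc desc;
  std::ifstream fin("externalErrorMsg.pb", std::ios::in | std::ios::binary);
  if (!desc.ParseFromIstream(&fin)) {
    std::cerr << "failed to parse externalErrorMsg.pb" << std::endl;
    return 1;
  }
  for (int i = 0; i < desc.errors_size(); ++i) {
    const auto& all = desc.errors(i);
    for (int j = 0; j < all.messages_size(); ++j) {
      // Each entry maps (ApiType, error code) to an explanatory message.
      std::cout << "type=" << all.type() << " code=" << all.messages(j).code()
                << " message=" << all.messages(j).message() << std::endl;
    }
  }
  return 0;
}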
+enum ApiType { + CUDA = 0; + CURAND = 1; + CUDNN = 2; + CUBLAS = 3; + CUSOLVER = 4; + NCCL = 5; +} + message MessageDesc { - // Indicates the type of error - required int32 errorCode = 1; + // Indicates the code of error + required int32 code = 1; // Indicates the message of error - required string errorMessage = 2; + required string message = 2; } message AllMessageDesc { - // Version of cuda API - required int32 version = 1; + // Indicates which kind of third-party API + required ApiType type = 1; // Error messages of different errortype - repeated MessageDesc Messages = 2; + repeated MessageDesc messages = 2; } -message cudaerrorDesc { - // Error messages of different cuda versions(9.0/10.0/10.2) - repeated AllMessageDesc AllMessages = 2; +message ExternalErrorDesc { + // Error messages of different kind of external third party API + repeated AllMessageDesc errors = 1; } \ No newline at end of file diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h new file mode 100644 index 0000000000000000000000000000000000000000..c6c22bb2f9203b00e924f06f6fe4bf1b0b4ffc65 --- /dev/null +++ b/paddle/fluid/platform/fast_divmod.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.1 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.1 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include "paddle/fluid/platform/hostdevice.h" + +#define INT_BITS 32 + +namespace paddle { +namespace operators { + +template +struct alignas(sizeof(T) * Size) CudaAlignedVector { + T val[Size]; +}; + +struct FastDivMod { + // 1st value represents the result of input number divides by recorded divisor + // 2nd value represents the result of input number modulo by recorded divisor + using DivModT = CudaAlignedVector; + + FastDivMod() {} + HOSTDEVICE FastDivMod(uint32_t d) : divisor(d) { + static_assert(sizeof(unsigned int) == 4, + "Only Support 32-bit unsigned int."); + + for (shift_val = 0; shift_val < INT_BITS; ++shift_val) { + auto shift_limit = 1 << shift_val; + if (shift_limit >= divisor) break; + } + uint64_t long_one = 1; + uint64_t temp_div = + ((long_one << INT_BITS) * ((long_one << shift_val) - divisor)) / + divisor + + 1; + multiplier = temp_div; + } + + __device__ __forceinline__ uint32_t Div(uint32_t n) const { + uint32_t t = __umulhi(n, multiplier); + return (t + n) >> shift_val; + } + + __device__ __forceinline__ DivModT Divmod(uint32_t n) const { + uint32_t q = Div(n); + DivModT result = {q, n - q * divisor}; + return result; + } + + int32_t divisor; + int32_t shift_val; + uint32_t multiplier; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 83b9544d23267be9de80ce9cd054a9b40bf892aa..1d76c2ea584b7e393da2bee6e0dd41731463eb81 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -578,6 +578,19 @@ DEFINE_string(tracer_mkldnn_ops_on, "", DEFINE_string(tracer_mkldnn_ops_off, "", "List of OneDNN operation types to be turned off"); +/** + * Debug related FLAG + * Name: check_kernel_launch + * Since Version: 2.1.0 + * Value Range: bool, default=false + * Example: + * Note: Check kernel launch status after every kernel compute. + */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +DEFINE_bool(check_kernel_launch, false, + "Check kernel launch status after every kernel compute"); +#endif + /** * CUDNN related FLAG * Name: conv2d_disable_cudnn diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc index f38603e80fb115f3131173c36f0ee2962d06c0de..5f6dd5679a1a8eacc270a17e0f725e4311897dda 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.cc +++ b/paddle/fluid/platform/gen_comm_id_helper.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/gen_comm_id_helper.h" #include @@ -33,6 +33,10 @@ limitations under the License. 
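The FastDivMod helper added above replaces a runtime 32-bit division by a fixed divisor with a multiply-high and a shift: it picks the smallest shift s with 2^s >= d and the multiplier m = (2^32 * (2^s - d)) / d + 1, so that n / d == (umulhi(n, m) + n) >> s. A host-side sketch that checks this construction on sampled inputs, with CUDA's __umulhi emulated via 64-bit arithmetic (names here are placeholders):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Host emulation of CUDA's __umulhi: the high 32 bits of a 32x32-bit product.
static uint32_t UMulHi(uint32_t a, uint32_t b) {
  return static_cast<uint32_t>((static_cast<uint64_t>(a) * b) >> 32);
}

struct FastDivModHost {
  explicit FastDivModHost(uint32_t d) : divisor(d) {
    for (shift = 0; shift < 32; ++shift) {
      if ((uint64_t{1} << shift) >= divisor) break;
    }
    // Same magic-number construction as in fast_divmod.h.
    multiplier = static_cast<uint32_t>(
        ((uint64_t{1} << 32) * ((uint64_t{1} << shift) - divisor)) / divisor + 1);
  }

  uint32_t Div(uint32_t n) const { return (UMulHi(n, multiplier) + n) >> shift; }

  uint32_t divisor;
  uint32_t shift;
  uint32_t multiplier;
};

int main() {
  for (uint32_t d = 1; d <= 257; ++d) {
    FastDivModHost fdm(d);
    for (uint32_t n = 0; n < (1u << 16); n += 3) {
      assert(fdm.Div(n) == n / d);
    }
  }
  std::printf("fast divmod formula matches integer division on sampled inputs\n");
  return 0;
}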
*/ #include "xpu/bkcl.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#endif + namespace paddle { namespace platform { @@ -262,10 +266,17 @@ static int ConnectAddr(const std::string& ep, const char* head) { return sock; } +// TODO(WANGXI): maybe need to unify this hard code +#ifdef PADDLE_WITH_ASCEND_CL +#define MAX_COMMUNIQUEID_LEN 4108 +#else +#define MAX_COMMUNIQUEID_LEN 1024 +#endif + template static void RecvCommID(int conn, CommUniqueId* nccl_id) { - char buffer[1024] = {0}; - static_assert(sizeof(CommUniqueId) <= 1024, + char buffer[MAX_COMMUNIQUEID_LEN] = {0}; + static_assert(sizeof(CommUniqueId) <= MAX_COMMUNIQUEID_LEN, "nccl id bytes must <= buffer size"); CHECK_SYS_CALL(SocketRecv(conn, buffer, sizeof(CommUniqueId)), @@ -275,7 +286,7 @@ static void RecvCommID(int conn, CommUniqueId* nccl_id) { template static void SendCommID(int conn, CommUniqueId* nccl_id) { - char buffer[1024] = {0}; + char buffer[MAX_COMMUNIQUEID_LEN] = {0}; memcpy(buffer, nccl_id, sizeof(CommUniqueId)); CHECK_SYS_CALL(SocketSend(conn, buffer, sizeof(CommUniqueId)), @@ -361,6 +372,9 @@ INSTANT_TEMPLATE(ncclUniqueId) #ifdef PADDLE_WITH_XPU_BKCL INSTANT_TEMPLATE(BKCLUniqueId) #endif +#ifdef PADDLE_WITH_ASCEND_CL +INSTANT_TEMPLATE(HcclRootInfo) +#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/gen_comm_id_helper.h b/paddle/fluid/platform/gen_comm_id_helper.h index c51c5ac6c8ac7bc8a8887c39c0b08d8cd0af4540..fb5d8d8fcd94059cbef66de809bca295d205a73c 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.h +++ b/paddle/fluid/platform/gen_comm_id_helper.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_XPU_BKCL) + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) #include #include #include diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/gpu_launch_config.h index 6c265677d63e99c173b7fdce8de362dc9b381352..4da91b4e764a5285b005ebc459c4dfa4e52df9cd 100644 --- a/paddle/fluid/platform/gpu_launch_config.h +++ b/paddle/fluid/platform/gpu_launch_config.h @@ -37,6 +37,7 @@ struct GpuLaunchConfig { dim3 theory_thread_count = dim3(1, 1, 1); dim3 thread_per_block = dim3(1, 1, 1); dim3 block_per_grid = dim3(1, 1, 1); + int compute_capability = 0; }; inline GpuLaunchConfig GetGpuLaunchConfig1D( @@ -67,11 +68,14 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D( std::min(max_threads, context.GetMaxThreadsPerBlock()); const int block_count = std::min(DivUp(physical_thread_count, thread_per_block), sm); + // Get compute_capability + const int capability = context.GetComputeCapability(); GpuLaunchConfig config; config.theory_thread_count.x = theory_thread_count; config.thread_per_block.x = thread_per_block; config.block_per_grid.x = block_count; + config.compute_capability = capability; return config; } diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 35776b9f1e6b88658fcefed015f0dc152a51d8bc..0b683a742c9fd8094e91c54d4f323120bad1eaca 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -135,13 +135,14 @@ inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector& dims, return mkldnn::memory::desc({dims}, data_type, format); } -inline void ClearMKLDNNCache(const platform::Place& place) { +inline void ClearMKLDNNCache(const platform::Place& place, + void* ptr = nullptr) { // Clear mkl-dnn cache, if 
(platform::is_cpu_place(place)) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(place); - dev_ctx->ResetBlobMap(); + dev_ctx->ResetBlobMap(ptr); platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( paddle::framework::DataLayout::kNCHW); } @@ -452,6 +453,9 @@ inline void AttachPointerHashToMKLDNNKey(void* ptr, paddle::platform::MKLDNNDeviceContext::tls().set_key_suffix( "E" + std::to_string(reinterpret_cast(ptr))); } + // Let's register adress of current executor + paddle::platform::MKLDNNDeviceContext::tls().set_curr_exec(ptr); + // For first thread if (first_thread == ThreadIDasStr()) { paddle::platform::MKLDNNDeviceContext::tls().disable_tid_in_key(); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 54efa55cc4cd9da7d5a0b868093adee74b4fe002..58622fb2529b830ed222284296153dd4b55c1cf8 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -35,7 +35,8 @@ using user_function = std::function(const float*)>; using memory = mkldnn::memory; template + typename TBackward = mkldnn_dummy_primitive, + typename TBackward_params = mkldnn_dummy_primitive> class MKLDNNHandlerT { public: MKLDNNHandlerT(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, @@ -72,6 +73,21 @@ class MKLDNNHandlerT { return backward_p; } + std::shared_ptr AcquireBackwardWeightsPrimitive() { + const std::string key_p = key_ + "@bwd_w_p"; + auto backward_p = + std::static_pointer_cast(dev_ctx_.GetBlob(key_p)); + if (backward_p == nullptr) { + PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, platform::errors::Unavailable( + "Error: BWD_PD should be set when " + "getting BWD prim witk key: %s .", + key_p)); + backward_p = std::make_shared(*bwd_w_pd_); + dev_ctx_.SetBlob(key_p, backward_p); + } + return backward_p; + } + std::shared_ptr AcquireSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); @@ -116,23 +132,55 @@ class MKLDNNHandlerT { "@diff_src_mem_p"); } + // Buffer of given Tensor is used for oneDNN computation + std::shared_ptr AcquireDiffWeightsMemory( + framework::Tensor* diff_weights) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "Error: BWD_W_PD should be set when getting BWD grad of weights.")); + T* ptr = diff_weights->mutable_data( + place_, bwd_w_pd_->diff_weights_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), ptr, + "@diff_wei_mem_p"); + } + + // Buffer is allocated by oneDNN to store computation results + std::shared_ptr AcquireDiffWeightsMemory(void) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "Error: BWD_W_PD should be set when getting BWD grad of weights.")); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), + "@diff_wei_mem_p"); + } + protected: bool isCached() { - const std::string key_pd = key_common_ + "@fwd_pd"; + const std::string key_pd = key_ + "@fwd_pd"; fwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_pd)); - const std::string key_p = key_ + "@fwd_p"; - return (dev_ctx_.GetBlob(key_p) != nullptr); + return (fwd_pd_ != nullptr); } bool isBwdCached() { - const std::string key_pd = key_common_ + "@bwd_pd"; + const std::string key_pd = key_ + "@bwd_pd"; bwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_pd)); - const std::string key_p = key_ + "@bwd_p"; - return (dev_ctx_.GetBlob(key_p) != nullptr); + if (bwd_pd_ == nullptr) { + 
return false; + } else { + // When BWD is cached then still we need to Get FWD PD + const std::string key_fpd = key_ + "@fwd_pd"; + fwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_fpd)); + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, platform::errors::Unavailable( + "Error: FWD PD should be set when BWD PD is cached.")); + return true; + } } // If your primitive descriptor requires attributes, pass them as a @@ -141,23 +189,14 @@ class MKLDNNHandlerT { // constructor, including the first one. template void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) { - // Forward PD has to be passed to Grad op that - // may be executed by diffrent thread, hence - // for that one we use key that does not contain TID - const std::string key_pd = key_common_ + "@fwd_pd"; + // This is used when we can recreate FWD PD in BWD so + // we do not need to pass FWD to BWD + const std::string key_pd = key_ + "@fwd_pd"; fwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_pd)); if (fwd_pd_ == nullptr) { - static std::mutex acquire_barrier; - std::lock_guard block_threads_until_finish_this_job( - acquire_barrier); - fwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_pd)); - if (fwd_pd_ == nullptr) { - CreateForwardPrimitiveDescriptor(first_arg, - std::forward(args)...); - dev_ctx_.SetBlob(key_pd, fwd_pd_); - } + CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); + dev_ctx_.SetBlob(key_pd, fwd_pd_); } } @@ -184,12 +223,12 @@ class MKLDNNHandlerT { template void AcquireBackwardPrimitiveDescriptor(Args&&... args) { - const std::string key_fwd_pd = key_common_ + "@fwd_pd"; - fwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_fwd_pd)); + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor PADDLE_ENFORCE_NOT_NULL( - fwd_pd_, platform::errors::Unavailable( - "Get MKLDNN Forward primitive %s failed.", key_fwd_pd)); + fwd_pd_, + platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.", + key_ + "@fwd_pd")); const std::string key_pd = key_ + "@bwd_pd"; bwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_pd)); @@ -201,6 +240,27 @@ class MKLDNNHandlerT { } } + template + void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.", + key_ + "@fwd_pd")); + const std::string key_pd = key_ + "@bwd_w_pd"; + bwd_w_pd_ = + std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + if (bwd_w_pd_ == nullptr) { + auto bwd_desc = + typename TBackward_params::desc(std::forward(args)...); + bwd_w_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + dev_ctx_.SetBlob(key_pd, bwd_w_pd_); + } + } + std::shared_ptr AcquireMemoryFromPrimitive( const std::string& suffix) { return std::static_pointer_cast( @@ -328,6 +388,7 @@ class MKLDNNHandlerT { std::string key_; std::shared_ptr fwd_pd_; std::shared_ptr bwd_pd_; + std::shared_ptr bwd_w_pd_; }; // TODO(grygielski) this class will be deleted later. @@ -538,17 +599,8 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { const std::string& uniq_name) : platform::MKLDNNHandlerT( dev_ctx, engine, cpu_place, - platform::CreateKey( - dev_ctx, framework::vectorize(x->dims()), uniq_name, - (algo == dnnl::algorithm::binary_mul ? 
"M" : ""))) { - // bradcasting combined with in-place may require - auto rankdiff = x->dims().size() - y->dims().size(); - if (rankdiff > 0) { - auto suffix = std::to_string(rankdiff); - this->key_ += suffix; - this->key_common_ += suffix; - } - + platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), + uniq_name)) { if (!this->isCached()) { PADDLE_ENFORCE_EQ( x->layout(), DataLayout::kMKLDNN, @@ -568,18 +620,24 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { const auto src_y_tz = framework::vectorize(y->dims()); // if output tensor(z) is nullptr then we are computing into oneDNN // managed buffer - const auto dst_tz = - (z == nullptr) ? src_x_tz : framework::vectorize(z->dims()); + auto rankdiff = x->dims().size() - y->dims().size(); + const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) + : framework::vectorize(z->dims()); - const auto src0_md = dnnl::memory::desc( + auto src0_md = dnnl::memory::desc( src_x_tz, platform::MKLDNNGetDataType(), x->format()); auto src1_md = dnnl::memory::desc( src_y_tz, platform::MKLDNNGetDataType(), y->format()); - if (rankdiff > 0) { + if (rankdiff > 0) { // Second input is of smaller rank than first std::vector dims1_ex(rankdiff, 1); dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), src_y_tz.begin(), src_y_tz.end()); src1_md = src1_md.reshape(dims1_ex); + } else if (rankdiff < 0) { // First input is of smaller than second + std::vector dims0_ex(-rankdiff, 1); + dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)), + src_x_tz.begin(), src_x_tz.end()); + src0_md = src0_md.reshape(dims0_ex); } const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::any); @@ -639,7 +697,8 @@ class BroadcastDataMKLDNNHandler const mkldnn::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, float scale_x, float scale_y, - const std::string& uniq_name) + const std::string& uniq_name, + const std::vector& input_dims) : platform::MKLDNNHandlerT( dev_ctx, engine, cpu_place, platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), @@ -659,24 +718,12 @@ class BroadcastDataMKLDNNHandler y->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument("Wrong format set for Y tensor.")); - auto src1_tz = framework::vectorize(y->dims()); const auto src0_tz = framework::vectorize(x->dims()); - // GetExpectedKernelType checks if smaller vector is a subvector with all - // the dims in correct order on the rightmost part of the bigger vector, - // i.e. 
a correct vector for broadcasting: - // x = 5, 7, 3, 2, 4, 8 - // y = 4, 8 - src1_tz.reserve(src0_tz.size()); - - for (size_t i = src1_tz.size(); i < src0_tz.size(); ++i) { - src1_tz.insert(src1_tz.begin(), 1L); - } - const auto src0_md = dnnl::memory::desc( src0_tz, platform::MKLDNNGetDataType(), x->format()); const auto src1_md = dnnl::memory::desc( - src1_tz, platform::MKLDNNGetDataType(), x->format()); + input_dims, platform::MKLDNNGetDataType(), x->format()); dnnl::primitive_attr attributes; attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); @@ -711,7 +758,7 @@ class ReductionMKLDNNHandler const mkldnn::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, const std::string& uniq_name, - std::vector output_dims) + std::vector y_tz) : platform::MKLDNNHandlerT( dev_ctx, engine, cpu_place, platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), @@ -725,14 +772,14 @@ class ReductionMKLDNNHandler x->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument("Wrong format set for X tensor.")); - const auto src_tz = framework::vectorize(x->dims()); + const auto x_tz = framework::vectorize(x->dims()); - const auto src_md = dnnl::memory::desc( - src_tz, platform::MKLDNNGetDataType(), x->format()); - const auto dst_md = memory::desc( - output_dims, platform::MKLDNNGetDataType(), x->format()); + const auto x_md = dnnl::memory::desc( + x_tz, platform::MKLDNNGetDataType(), x->format()); + const auto y_md = + memory::desc(y_tz, platform::MKLDNNGetDataType(), x->format()); - this->AcquireForwardPrimitiveDescriptor(algo, src_md, dst_md, p, eps); + this->AcquireForwardPrimitiveDescriptor(algo, x_md, y_md, p, eps); } } }; @@ -742,45 +789,100 @@ class ActivationMKLDNNHandler : public MKLDNNHandlerT { public: - ActivationMKLDNNHandler(const std::vector& dims, - mkldnn::algorithm algorithm, float alpha, float beta, - const MKLDNNMemoryFormat fmt, - const platform::MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, + ActivationMKLDNNHandler(mkldnn::algorithm algorithm, + const framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, Place cpu_place, + const framework::Tensor* in_x, const std::string& unique_name, bool is_inplaced) - : platform::MKLDNNHandlerT( dev_ctx, dev_ctx.GetEngine(), cpu_place, - is_inplaced - ? platform::CreateKey(dev_ctx, dims, "a", algorithm, - unique_name) - : platform::CreateKey(dev_ctx, dims, "a", unique_name)) { - auto md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); - - this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, - algorithm, md, alpha, beta); - } - - ActivationMKLDNNHandler(const std::vector& dims, - mkldnn::algorithm algorithm, float alpha, float beta, - const MKLDNNMemoryFormat fmt, - const MKLDNNMemoryFormat diff_fmt, - const platform::MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, - const std::string& unique_name) + is_inplaced ? platform::CreateKey( + dev_ctx, framework::vectorize(in_x->dims()), "a", + algorithm, unique_name) + : platform::CreateKey( + dev_ctx, framework::vectorize(in_x->dims()), "a", + unique_name)) { + if (!this->isCached()) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + // eltwise_linear means we are in scale op + if (algorithm == mkldnn::algorithm::eltwise_linear) { + bool bias_after_scale = ctx.Attr("bias_after_scale"); + auto* scale_tensor = ctx.Input("ScaleTensor"); + alpha = (scale_tensor == nullptr) ? 
ctx.Attr("scale") + : (float)*(scale_tensor->data()); + beta = ctx.Attr("bias"); + // if bias_after_scale == true + // out = scale*X + bias + // else + // out = scale*(X + bias) = scale*X + scale*bias + if (!bias_after_scale) beta *= alpha; + } else { + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); + } + } + PADDLE_ENFORCE(in_x->dims().size() >= 1 || in_x->dims().size() <= 6, + platform::errors::Unimplemented( + "Input dimension size can be 1, 2, 3, 4, " + "5, or 6, but now the dimension size is", + in_x->dims().size())); + + auto src_tz = framework::vectorize(in_x->dims()); + auto src_fmt = + src_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto md = mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), + src_fmt); + + this->AcquireForwardPrimitiveDescriptor( + mkldnn::prop_kind::forward_training, algorithm, md, alpha, beta); + } + } + + ActivationMKLDNNHandler(mkldnn::algorithm algorithm, + const framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, Place cpu_place, + const framework::Tensor* in_x, const Tensor* out_grad, + const std::string& unique_name) : platform::MKLDNNHandlerT( dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, dims, "a", unique_name)) { - auto diff_dst_md = platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), diff_fmt); - auto src_md = - platform::MKLDNNMemDesc(dims, platform::MKLDNNGetDataType(), fmt); + platform::CreateKey(dev_ctx, framework::vectorize(in_x->dims()), + "a", unique_name)) { + if (!this->isBwdCached()) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); + } - this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, - alpha, beta); + auto diff_dst_tz = framework::vectorize(out_grad->dims()); + + auto src_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto diff_fmt = + diff_dst_tz.size() == 2 ? 
MKLDNNMemoryFormat::nc : out_grad->format(); + + auto dims = framework::vectorize(in_x->dims()); + auto diff_dst_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), diff_fmt); + auto src_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), src_fmt); + + this->AcquireForwardPrimitiveDescriptor( + mkldnn::prop_kind::forward_training, algorithm, src_md, alpha, beta); + this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, + alpha, beta); + } } std::shared_ptr AcquireBackwardSrcMemory( @@ -792,82 +894,6 @@ class ActivationMKLDNNHandler } }; -template -class LRNMKLDNNHandler - : public MKLDNNHandlerT { - public: - LRNMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, - const platform::MKLDNNDeviceContext& dev_ctx, - const mkldnn::engine mkldnn_engine, - platform::Place cpu_place, const Tensor* input, - const std::string& unique_name) - - : platform::MKLDNNHandlerT( - dev_ctx, mkldnn_engine, cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), - unique_name)) { - if (!this->isCached()) { - const int n = ctx.Attr("n"); - // MKL-DNN implements LRN in a caffe way: - // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html - // Where sum of squares is divided by size of normalization window - // this is not the case for PaddlePaddle LRN. - // Hence we need to compensate for this diffrence by - // multipliing alpha by size of window(n) - const float alpha = ctx.Attr("alpha") * static_cast(n); - const float beta = ctx.Attr("beta"); - const float k = ctx.Attr("k"); - bool is_test = ctx.Attr("is_test"); - - auto dims = paddle::framework::vectorize(input->dims()); - - auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), - input->format()); - - this->AcquireForwardPrimitiveDescriptor( - is_test ? 
mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training, - mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); - } - } - - LRNMKLDNNHandler(const std::vector& dims, const int n, - const float alpha, const float beta, const float k, - const MKLDNNMemoryFormat fmt, - const MKLDNNMemoryFormat diff_fmt, - const platform::MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, const std::string& unique_name) - - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, dims, unique_name)) { - auto src_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); - auto diff_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), diff_fmt); - - this->AcquireBackwardPrimitiveDescriptor( - mkldnn::algorithm::lrn_across_channels, src_md, diff_md, n, alpha, beta, - k); - } - - std::shared_ptr AcquireWorkspaceMemory( - framework::Tensor* workspace) { - T* ptr = workspace->mutable_data( - this->place_, this->fwd_pd_->workspace_desc().get_size()); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), - ptr, "@wrk_mem_p"); - } - - std::shared_ptr AcquireBackwardWorkspaceMemory( - const framework::Tensor* workspace) { - const T* workspace_data = workspace->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), - to_void_cast(workspace_data), - "@bwd-wrk_mem_p"); - } -}; - template class TransposeMKLDNNHandler : public MKLDNNHandler { public: @@ -971,13 +997,50 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { : platform::MKLDNNHandler(dev_ctx, engine, base_key), dims_(dims), vtype_(vtype), - dtype_(dtype) {} + vtype_dst_(vtype), + dtype_(dtype), + dtype_dst_(dtype) {} + + ReorderMKLDNNHandler(std::vector& dims, // NOLINT + framework::proto::VarType::Type vtype, + mkldnn::memory::data_type dtype, + framework::proto::VarType::Type vtype_dst, + mkldnn::memory::data_type dtype_dst, + const platform::MKLDNNDeviceContext& dev_ctx, + mkldnn::engine engine, const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + dims_(dims), + vtype_(vtype), + vtype_dst_(vtype_dst), + dtype_(dtype), + dtype_dst_(dtype_dst) {} std::shared_ptr AcquireSrcMemory( const MKLDNNMemoryFormat& fmt, void* ptr) { return this->AcquireMemory(dims_, dtype_, fmt, ptr, "@user_src_mem_p"); } + std::shared_ptr AcquireSrcSubmemory( + const std::vector& dims, const std::vector& offset, + const std::shared_ptr& mem_p, int submemory_number) { + std::string local_key = key_; + local_key.append("@submem") + .append(std::to_string(submemory_number)) + .append("_p"); + + auto sub_mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + if (sub_mem_p == nullptr) { + auto sub_md = mem_p->get_desc().submemory_desc(dims, {offset}); + sub_mem_p = std::make_shared(sub_md, engine_, + mem_p->get_data_handle()); + dev_ctx_.SetBlob(local_key, sub_mem_p); + } else { + sub_mem_p->set_data_handle(mem_p->get_data_handle()); + } + return sub_mem_p; + } + std::shared_ptr AcquireDstMemory( framework::Tensor* output, const MKLDNNMemoryFormat& fmt, platform::Place place) { @@ -985,20 +1048,59 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { - auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_, fmt); - auto dst_data = output->mutable_data(place, vtype_, dst_md.get_size()); + auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_dst_, fmt); + auto dst_data = + output->mutable_data(place, 
vtype_dst_, dst_md.get_size()); + + mem_p = std::make_shared(dst_md, engine_, dst_data); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + // Even if memory object exists , we may be using it for diffrent tensor + auto dst_data = + output->mutable_data(place, vtype_dst_, mem_p->get_desc().get_size()); + mem_p->set_data_handle(dst_data); + } + return mem_p; + } + + std::shared_ptr AcquireDstMemory( + framework::Tensor* output, const std::vector& dims, + const int memory_number, const MKLDNNMemoryFormat& fmt, + platform::Place place) { + auto local_key = + key_ + "@user_dst_mem" + std::to_string(memory_number) + "_p"; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + if (mem_p == nullptr) { + auto dst_md = platform::MKLDNNMemDesc(dims, dtype_dst_, fmt); + auto dst_data = + output->mutable_data(place, vtype_dst_, dst_md.get_size()); mem_p = std::make_shared(dst_md, engine_, dst_data); dev_ctx_.SetBlob(local_key, mem_p); } else { // Even if memory object exists , we may be using it for diffrent tensor auto dst_data = - output->mutable_data(place, vtype_, mem_p->get_desc().get_size()); + output->mutable_data(place, vtype_dst_, mem_p->get_desc().get_size()); mem_p->set_data_handle(dst_data); } return mem_p; } + std::shared_ptr AcquireReorder( + std::shared_ptr dst_memory_p, + std::shared_ptr src_memory_p, int reorder_number) { + auto prim_key = key_ + "@reorder" + std::to_string(reorder_number) + "_p"; + auto reorder_p = + std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + if (reorder_p == nullptr) { + reorder_p = + std::make_shared(*(src_memory_p), *(dst_memory_p)); + dev_ctx_.SetBlob(prim_key, reorder_p); + } + return reorder_p; + } + std::shared_ptr AcquireReorder( std::shared_ptr dst_memory_p, std::shared_ptr src_memory_p) { @@ -1015,8 +1117,8 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { private: std::vector dims_; - framework::proto::VarType::Type vtype_; - mkldnn::memory::data_type dtype_; + framework::proto::VarType::Type vtype_, vtype_dst_; + mkldnn::memory::data_type dtype_, dtype_dst_; }; template diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 1cc9fd9fe76341cd495a3580cddbff65f5b0e208..14c772d88897f4fa28e7c37a9452b78b637419a2 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -34,6 +34,7 @@ class PlacePrinter : public boost::static_visitor<> { } void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; } void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; } + void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; } void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; } private: @@ -62,6 +63,10 @@ bool is_cuda_pinned_place(const Place &p) { return boost::apply_visitor(IsCUDAPinnedPlace(), p); } +bool is_npu_pinned_place(const Place &p) { + return boost::apply_visitor(IsNPUPinnedPlace(), p); +} + bool places_are_same_class(const Place &p1, const Place &p2) { return p1.which() == p2.which(); } diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index f20fac477d0ec4ef40a3544476e223b6ad97fffa..62d30ecc5ce2efdc1e87229843ee39685507d771 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -85,10 +85,19 @@ struct NPUPlace { int device; }; +struct NPUPinnedPlace { + NPUPinnedPlace() {} + + inline bool operator==(const NPUPinnedPlace &) const { return true; } + inline bool operator!=(const NPUPinnedPlace &) const { return false; } + inline bool operator<(const 
NPUPinnedPlace &) const { return false; } +}; + struct IsCUDAPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return false; } + bool operator()(const NPUPinnedPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return true; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -97,6 +106,7 @@ struct IsCPUPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return true; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return false; } + bool operator()(const NPUPinnedPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -105,6 +115,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return false; } + bool operator()(const NPUPinnedPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; } }; @@ -113,6 +124,7 @@ struct IsXPUPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return true; } bool operator()(const NPUPlace &) const { return false; } + bool operator()(const NPUPinnedPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -121,15 +133,25 @@ struct IsNPUPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return true; } + bool operator()(const NPUPinnedPlace &) const { return false; } + bool operator()(const CUDAPlace &) const { return false; } + bool operator()(const CUDAPinnedPlace &) const { return false; } +}; + +struct IsNPUPinnedPlace : public boost::static_visitor { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const XPUPlace &) const { return false; } + bool operator()(const NPUPlace &) const { return false; } + bool operator()(const NPUPinnedPlace &) const { return true; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; class Place : public boost::variant { + CUDAPinnedPlace, NPUPinnedPlace> { private: - using PlaceBase = - boost::variant; + using PlaceBase = boost::variant; public: Place() = default; @@ -139,6 +161,8 @@ class Place : public boost::variant(place)); @@ -155,6 +179,7 @@ bool is_xpu_place(const Place &); bool is_npu_place(const Place &); bool is_cpu_place(const Place &); bool is_cuda_pinned_place(const Place &); +bool is_npu_pinned_place(const Place &); bool places_are_same_class(const Place &, const Place &); bool is_same_place(const Place &, const Place &); @@ -190,6 +215,17 @@ struct PlaceVisitorWrapper #endif } + typename Visitor::result_type operator()( + const NPUPinnedPlace &npu_pinned) const { +#ifdef PADDLE_WITH_ASCEND_CL + return visitor_(npu_pinned); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with NPU. 
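The reason NPUPinnedPlace has to be added to every visitor above is that boost::static_visitor dispatch must provide an overload for every alternative of the Place variant; a minimal sketch of that pattern with placeholder types:

#include <boost/variant.hpp>
#include <iostream>

// Placeholder place types, not Paddle's real ones.
struct CpuLike {};
struct PinnedLike {};

using AnyPlace = boost::variant<CpuLike, PinnedLike>;

struct IsPinned : public boost::static_visitor<bool> {
  bool operator()(const CpuLike&) const { return false; }
  bool operator()(const PinnedLike&) const { return true; }
};

int main() {
  AnyPlace p = PinnedLike{};
  std::cout << std::boolalpha << boost::apply_visitor(IsPinned(), p)
            << std::endl;  // true
  return 0;
}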
Cannot visit npu_pinned")); + return typename Visitor::result_type(); +#endif + } + typename Visitor::result_type operator()(const CUDAPlace &cuda) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return visitor_(cuda); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index aef7f8648f8304d127e085364521cd9ded0fb85e..9c33233e1f79ac799d5acc2a711119d279a9613d 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -32,8 +32,12 @@ namespace platform { MemEvenRecorder MemEvenRecorder::recorder; Event::Event(EventType type, std::string name, uint32_t thread_id, - EventRole role) - : type_(type), name_(name), thread_id_(thread_id), role_(role) { + EventRole role, std::string attr) + : type_(type), + name_(name), + thread_id_(thread_id), + role_(role), + attr_(attr) { cpu_ns_ = GetTimeInNsec(); } @@ -52,7 +56,8 @@ double Event::CudaElapsedMs(const Event &e) const { #endif } -RecordEvent::RecordEvent(const std::string &name, const EventRole role) { +RecordEvent::RecordEvent(const std::string &name, const EventRole role, + const std::string attr) { #ifndef _WIN32 #ifdef PADDLE_WITH_CUDA if (g_enable_nvprof_hook) { @@ -69,7 +74,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) { is_enabled_ = true; // lock is not needed, the code below is thread-safe // Maybe need the same push/pop behavior. - Event *e = PushEvent(name, role); + Event *e = PushEvent(name, role, attr); SetCurAnnotation(e); name_ = e->name(); } @@ -186,12 +191,14 @@ void Mark(const std::string &name) { GetEventList().Record(EventType::kMark, name, g_thread_id); } -Event *PushEvent(const std::string &name, const EventRole role) { - return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role); +Event *PushEvent(const std::string &name, const EventRole role, + std::string attr) { + return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role, + attr); } -void PopEvent(const std::string &name, const EventRole role) { - GetEventList().Record(EventType::kPopRange, name, g_thread_id, role); +void PopEvent(const std::string &name, const EventRole role, std::string attr) { + GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr); } void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 2e802bf5ea303c4a4bb75492746b2434bd75f595..512bbc195b5b25dc2f707204b126bcee9af622c1 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -126,7 +126,8 @@ struct MemEvenRecorder { struct RecordEvent { RecordEvent(const std::string& name, - const EventRole role = EventRole::kOrdinary); + const EventRole role = EventRole::kOrdinary, + const std::string attr = "none"); ~RecordEvent(); @@ -200,8 +201,10 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, const Place& place, const std::string& annotation); void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, const Place& place, const std::string& annotation); -Event* PushEvent(const std::string& name, const EventRole role); -void PopEvent(const std::string& name, const EventRole role); +Event* PushEvent(const std::string& name, const EventRole role, + const std::string attr = "none"); +void PopEvent(const std::string& name, const EventRole role, + const std::string attr = "none"); // Return the event list of all threads. 
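A hedged usage sketch of the extended profiler API above; the event name and attr payload are placeholders:

// Illustrative only: attach a free-form attribute string to a profiler event.
{
  paddle::platform::RecordEvent record_event(
      "my_op_compute", paddle::platform::EventRole::kOrdinary, "dtype=float32");
  // ... timed work ...
}  // the event is popped when record_event goes out of scope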
Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> GetAllEvents(); diff --git a/paddle/fluid/platform/xpu_header.h b/paddle/fluid/platform/xpu_header.h index 9f2befc123f224aeda3cb4a3d196cbce470d51b2..99f4224b5d408a6450d801ff643f658b74333387 100644 --- a/paddle/fluid/platform/xpu_header.h +++ b/paddle/fluid/platform/xpu_header.h @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include "paddle/fluid/platform/errors.h" +#include "paddle/fluid/platform/float16.h" #include "xpu/api.h" #include "xpu/refactor/fusion.h" #include "xpu/refactor/math.h" @@ -58,4 +59,16 @@ static std::map XPUAPIErrorMsg = { {xpu::Error_t::RUNTIME_ERROR, "xpu api runtime error"}, {xpu::Error_t::NO_ENOUGH_WORKSPACE, "xpu api no enough workspace"}}; +template +class XPUTypeTrait { + public: + using Type = T; +}; + +template <> +class XPUTypeTrait { + public: + using Type = float16; +}; + #endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b30214e1d83559f31758a85c797e3a410ad1ad61..f1435f1b916cb0815da44cb2d7c75937023f71df 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -56,6 +56,7 @@ set(PYBIND_SRCS ir.cc inference_api.cc compatible.cc + io.cc generator_py.cc) if(WITH_ASCEND) @@ -73,6 +74,14 @@ if (WITH_CRYPTO) set(PYBIND_SRCS ${PYBIND_SRCS} crypto.cc) endif (WITH_CRYPTO) +if (WITH_PSLIB) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(DISTRIBUTE_COMPILE_FLAGS + "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") + endif() + set_source_files_properties(heter_wrapper_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +endif(WITH_PSLIB) if (WITH_PSCORE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor -Wno-error=sign-compare -Wno-error=unused-variable -Wno-error=return-type -Wno-error=unused-but-set-variable -Wno-error=type-limits -Wno-error=unknown-pragmas -Wno-error=parentheses -Wno-error=unused-result") set_source_files_properties(fleet_py.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -120,14 +129,20 @@ if(WITH_PYTHON) else() set(op_function_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") endif() + file(TO_NATIVE_PATH ${op_function_generator_path} op_function_generator_path) + file(TO_NATIVE_PATH ${impl_file} impl_file) + file(TO_NATIVE_PATH ${tmp_impl_file} tmp_impl_file) + file(WRITE ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat "" "set build_times=1\n" ":retry\n" "ECHO op_function_generator run %build_times% time\n" - "${op_function_generator_path}/op_function_generator ${impl_file}\n" + "if exist ${tmp_impl_file} del ${tmp_impl_file}\n" + "taskkill /f /im op_function_generator.exe 2>NUL\n" + "${op_function_generator_path}\\op_function_generator.exe ${tmp_impl_file}\n" "if %ERRORLEVEL% NEQ 0 (\n" " set /a build_times=%build_times%+1\n" - " if 
%build_times% GTR 5 (\n" + " if %build_times% GEQ 10 (\n" " exit /b 1\n" " ) else (\n" " goto :retry\n" @@ -137,6 +152,8 @@ if(WITH_PYTHON) add_custom_command(TARGET op_function_generator POST_BUILD COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} + COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" ) if(${CBLAS_PROVIDER} STREQUAL MKLML) @@ -168,7 +185,7 @@ if(WITH_PYTHON) "${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" "${tmp_impl_file}" COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} - COMMENT "copy_if_different ${impl_file}" + COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" VERBATIM ) if(WITH_MKL) diff --git a/paddle/fluid/pybind/ascend_wrapper_py.cc b/paddle/fluid/pybind/ascend_wrapper_py.cc index 9a1fa1d7704c213239b3b1857622309fc63a5ded..43725f7dc0f73e438834b108f8f65069f96db575 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.cc +++ b/paddle/fluid/pybind/ascend_wrapper_py.cc @@ -108,12 +108,14 @@ enum AttrType { AT_NAMEATTR }; +#ifdef PADDLE_WITH_ASCEND void BindAscendDevice(py::module *m) { py::class_(*m, "NPUDevice") .def_static( "get_device_count", static_cast(&platform::ascend::NPUDevice::GetDeviceCount)); } +#endif void BindAscendGraph(py::module *m) { m->def("ge_initialize", &ge_initialize, "GEInitialize"); diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index 91461aa26f341a91f942fc44a70064fa49ece31c..a6b542f53ae1785252b8993982345fd233902458 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -28,6 +28,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/distributed/common/sparse_sharding_merge.h" #include "paddle/fluid/distributed/communicator_common.h" #include "paddle/fluid/distributed/fleet.h" #include "paddle/fluid/distributed/index_dataset/index_sampler.h" @@ -48,6 +49,7 @@ using paddle::distributed::GraphNode; using paddle::distributed::GraphPyServer; using paddle::distributed::GraphPyClient; using paddle::distributed::FeatureNode; +using paddle::distributed::ShardingMerge; namespace paddle { namespace pybind { @@ -56,6 +58,8 @@ void BindDistFleetWrapper(py::module* m) { "DistFleetWrapper") .def(py::init([]() { return FleetWrapper::GetInstance(); })) .def("load_sparse", &FleetWrapper::LoadSparseOnServer) + .def("load_model", &FleetWrapper::LoadModel) + .def("load_one_table", &FleetWrapper::LoadModelOneTable) .def("init_server", &FleetWrapper::InitServer) .def("run_server", (uint64_t (FleetWrapper::*)(void)) & FleetWrapper::RunServer) @@ -85,6 +89,12 @@ void BindPSHost(py::module* m) { .def("to_string", &distributed::PSHost::to_string); } +void BindSparseShardingTools(py::module* m) { + py::class_(*m, "ShardingMerge") + .def(py::init<>()) + .def("merge", &ShardingMerge::Merge); +} + void BindCommunicatorContext(py::module* m) { py::class_(*m, "CommContext") .def( diff --git a/paddle/fluid/pybind/fleet_py.h b/paddle/fluid/pybind/fleet_py.h index 206a69f5a80197b15b5f579faefdad2075461c2c..4dc0f002ad3c1d9580ce8301cc74009555f552a3 100644 --- a/paddle/fluid/pybind/fleet_py.h +++ b/paddle/fluid/pybind/fleet_py.h @@ -36,5 +36,6 @@ void BindIndexNode(py::module* m); void BindTreeIndex(py::module* m); void BindIndexWrapper(py::module* m); void BindIndexSampler(py::module* m); +void BindSparseShardingTools(py::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc 
b/paddle/fluid/pybind/global_value_getter_setter.cc index bc8d1e5b40585dd8a44255b33c835be12c473cec..4824a34e843bb1eb3074ad59554a3adb61f99554 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -41,6 +41,7 @@ DECLARE_int32(multiple_of_cupti_buffer_size); DECLARE_bool(reader_queue_speed_test_mode); DECLARE_int32(call_stack_level); DECLARE_bool(sort_sum_gradient); +DECLARE_bool(check_kernel_launch); // device management DECLARE_int32(paddle_num_threads); // executor @@ -376,7 +377,7 @@ static void RegisterGlobalVarGetterSetter() { FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb, FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math, FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce, - FLAGS_conv2d_disable_cudnn); + FLAGS_conv2d_disable_cudnn, FLAGS_check_kernel_launch); #endif #ifdef PADDLE_WITH_XPU REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 93441eb52fe5ed93c3b03781d42abe8a3c7dfc40..619301e3b45d3116a545dd16ef1d5dc165a4f210 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -51,6 +51,8 @@ limitations under the License. */ namespace paddle { namespace pybind { +PyTypeObject *g_varbase_pytype = nullptr; + namespace py = ::pybind11; class Layer : public imperative::Layer { @@ -133,30 +135,44 @@ static const platform::Place PyObjectToPlace(const py::object &place_obj) { return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); + } else if (py::isinstance(place_obj)) { + return place_obj.cast(); } else if (py::isinstance(place_obj)) { return place_obj.cast(); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Place should be one of " - "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace")); + "Place/CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace")); } } -static void InitTensorForVarBase(imperative::VarBase *self, - const py::array &array, - const platform::Place place, - bool persistable = false, - bool zero_copy = false, std::string name = "", - int stop_gradient = -1) { - if (name == "") { - name = - imperative::GetCurrentTracer()->GenerateUniqueName("generated_tensor"); - } - VLOG(5) << "Init Tensor as: / name: " << name - << " / persistable: " << persistable << " / zero_copy: " << zero_copy +// only initialize varbase, but not its tensor. +static void InitVarBaseOnly(imperative::VarBase *self, const std::string &name, + bool persistable = false, int stop_gradient = -1) { + auto name_ = name == "" + ? imperative::GetCurrentTracer()->GenerateUniqueName( + "generated_tensor") + : name; + + VLOG(5) << "Init Tensor as: / name: " << name_ + << " / persistable: " << persistable << " / stop_gradient: " << stop_gradient; - new (self) imperative::VarBase(name); + new (self) imperative::VarBase(name_); + if (stop_gradient != -1) { + self->SetOverridedStopGradient(stop_gradient); + } + self->SetPersistable(persistable); + self->SetType(framework::proto::VarType::LOD_TENSOR); +} + +// initialize varbase and its tensor. 
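+// The VarBase itself is created via InitVarBaseOnly above; the tensor is then +// filled from the given numpy array on the requested place (CPU, XPU, CUDA, +// CUDAPinned or NPU) and the VarBase's data type is set from that tensor.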
+static void InitVarBaseAndTensor( + imperative::VarBase *self, const py::array &array, + const platform::Place &place, const std::string &name, + bool persistable = false, bool zero_copy = false, int stop_gradient = -1) { + InitVarBaseOnly(self, name, persistable, stop_gradient); auto *tensor = self->MutableVar()->GetMutable(); + VLOG(4) << "zero_copy: " << zero_copy; if (platform::is_cpu_place(place)) { SetTensorFromPyArray( tensor, array, BOOST_GET_CONST(platform::CPUPlace, place), zero_copy); @@ -170,30 +186,23 @@ static void InitTensorForVarBase(imperative::VarBase *self, SetTensorFromPyArray( tensor, array, BOOST_GET_CONST(platform::CUDAPinnedPlace, place), zero_copy); + } else if (platform::is_npu_place(place)) { + SetTensorFromPyArray( + tensor, array, BOOST_GET_CONST(platform::NPUPlace, place), zero_copy); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Place should be one of CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace")); - } - if (stop_gradient != -1) { - self->SetOverridedStopGradient(stop_gradient); + "Place should be one of " + "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/NPUPlace")); } - self->SetPersistable(persistable); - self->SetType(framework::proto::VarType::LOD_TENSOR); self->SetDataType(tensor->type()); } static void InitVarBaseFromNumpyWithKwargs(imperative::VarBase *self, const py::kwargs &kwargs) { VLOG(4) << "Init VarBase from kwargs: "; - PADDLE_ENFORCE_EQ( - kwargs.contains("value"), true, - platform::errors::NotFound( - "The kwargs used to create Varbase misses argument: value")); auto persistable = kwargs.contains("persistable") ? kwargs["persistable"].cast() : false; - auto array = kwargs.contains("value") ? kwargs["value"].cast() - : py::array(); auto zero_copy = kwargs.contains("zero_copy") ? kwargs["zero_copy"].cast() : false; auto name = kwargs.contains("name") ? kwargs["name"].cast() : ""; @@ -201,10 +210,18 @@ static void InitVarBaseFromNumpyWithKwargs(imperative::VarBase *self, ? kwargs["stop_gradient"].cast() : -1; auto default_place = imperative::GetCurrentTracer()->ExpectedPlace(); - auto place = kwargs.contains("place") ? PyObjectToPlace(kwargs["place"]) - : default_place; - InitTensorForVarBase(self, array, place, persistable, zero_copy, name, - stop_gradient); + + if (kwargs.contains("value")) { + auto array = kwargs["value"].cast(); + // place is only used when array is given, otherwise, it is meaningless and + // ignored + auto place = kwargs.contains("place") ? 
PyObjectToPlace(kwargs["place"]) + : default_place; + InitVarBaseAndTensor(self, array, place, name, persistable, zero_copy, + stop_gradient); + } else { + InitVarBaseOnly(self, name, persistable, stop_gradient); + } } template @@ -239,11 +256,11 @@ static void InitVarBaseFromNumpyWithArgDefault(imperative::VarBase *self, const py::array &array) { auto place = imperative::GetCurrentTracer()->ExpectedPlace(); VLOG(4) << "Init VarBase from numpy at " << place; - InitTensorForVarBase(self, array, place); + InitVarBaseAndTensor(self, array, place, ""); } static void InitVarBaseFromTensorWithArgDefault( - imperative::VarBase *self, const framework::LoDTensor &tensor) { + imperative::VarBase *self, const framework::Tensor &tensor) { VLOG(4) << "Init VarBase"; auto place = imperative::GetCurrentTracer()->ExpectedPlace(); new (self) imperative::VarBase( @@ -469,6 +486,62 @@ static void ParseIndexingSlice(framework::LoDTensor *tensor, PyObject *_index, if (!PyTuple_Check(_index)) Py_DecRef(index); } +template +static void VarBaseCopy(std::shared_ptr &src, // NOLINT + imperative::VarBase &dst, // NOLINT + const P &dst_device, const bool blocking) { + if (dst.SharedVar()->IsEmpty()) { + VLOG(3) << "deep copy Variable from " << src->Name() << " to " + << dst.Name(); + dst.SetPersistable(src->Persistable()); + dst.SetDataType(src->DataType()); + dst.SetType(src->Type()); + dst.SetOverridedStopGradient(src->OverridedStopGradient()); + if (!src->SharedVar()->IsEmpty()) { + if (src->Var().IsType()) { + auto &src_tensor = src->Var().Get(); + auto *dst_tensor = dst.MutableVar()->GetMutable(); + dst_tensor->set_lod(src_tensor.lod()); + framework::TensorCopy(src_tensor, dst_device, dst_tensor); + if (blocking) { + platform::DeviceContextPool::Instance().Get(dst_device)->Wait(); + auto src_device = src_tensor.place(); + if (!(src_device == dst_device)) { + platform::DeviceContextPool::Instance().Get(src_device)->Wait(); + } + } + } else if (src->Var().IsType()) { + auto &src_selected_rows = src->Var().Get(); + auto *dst_selected_rows = + dst.MutableVar()->GetMutable(); + dst_selected_rows->set_height(src_selected_rows.height()); + dst_selected_rows->set_rows(src_selected_rows.rows()); + framework::TensorCopy(src_selected_rows.value(), dst_device, + dst_selected_rows->mutable_value()); + if (blocking) { + platform::DeviceContextPool::Instance().Get(dst_device)->Wait(); + auto src_device = src_selected_rows.value().place(); + if (!(src_device == dst_device)) { + platform::DeviceContextPool::Instance().Get(src_device)->Wait(); + } + } + } + + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(src, dst_device); + } + + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The source Tensor(%s) cannot be copied when it is empty.", src->Name())); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The destination Tensor(%s) cannot be copied to when it is not empty.", + dst.Name())); + } +} + // Bind Methods void BindImperative(py::module *m_ptr) { auto &m = *m_ptr; @@ -611,9 +684,10 @@ void BindImperative(py::module *m_ptr) { imperative::SetCurrentTracer(tracer); }); - py::class_>( - m, "VarBase", R"DOC()DOC") - .def_static("_alive_vars", &imperative::VarBase::AliveVarNames) + py::class_> varbase( + m, "VarBase", R"DOC()DOC"); + g_varbase_pytype = (PyTypeObject *)varbase.ptr(); // NOLINT + varbase.def_static("_alive_vars", &imperative::VarBase::AliveVarNames) .def("__init__", [](imperative::VarBase &self) { std::string name = @@ -659,6 +733,10 @@ void BindImperative(py::module *m_ptr) { 
py::arg("value"), py::arg("place"), py::arg("persistable") = false, py::arg("zero_copy") = false, py::arg("name") = "", py::arg("stop_gradient") = -1) + .def("__init__", &InitVarBaseFromNumpyWithArg, + py::arg("value"), py::arg("place"), py::arg("persistable") = false, + py::arg("zero_copy") = false, py::arg("name") = "", + py::arg("stop_gradient") = -1) .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value")) .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor")) .def("__init__", &InitVarBaseFromNumpyWithKwargs) @@ -710,6 +788,13 @@ void BindImperative(py::module *m_ptr) { imperative::NameVarBaseMap ins = {{"Input", {self}}}; imperative::NameVarBaseMap outs = {{"Out", {self}}}; + PADDLE_ENFORCE_EQ( + self->IsLeaf() && !self->OverridedStopGradient(), false, + platform::errors::InvalidArgument( + "Leaf Tensor (%s) that doesn't stop gradient can't use " + "inplace strategy.", + self->Name())); + auto value_tensor = value_obj.cast>(); ins.insert({"ValueTensor", {value_tensor}}); @@ -784,6 +869,70 @@ void BindImperative(py::module *m_ptr) { return out; } }) + .def( + "_getitem_from_offset", + [](std::shared_ptr &self, const py::args &args) { + const auto &tensor = self->Var().Get(); + PADDLE_ENFORCE_EQ( + tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "Tensor of %s is Empty, please check if it has no data.", + self->Name())); + + const auto &tensor_dims = tensor.dims(); + + std::vector dims(tensor_dims.size()); + std::vector strides(tensor_dims.size()); + + size_t numel = 1; + for (int i = tensor_dims.size() - 1; i >= 0; --i) { + strides[i] = numel; + dims[i] = static_cast(tensor_dims[i]); + numel *= dims[i]; + } + size_t offset = 0; + if (args.empty()) { + PADDLE_ENFORCE_EQ( + numel, 1, + platform::errors::InvalidArgument( + "only one-element tensors can be converted to Python " + "scalars when no input coordinates are given")); + } else if (args.size() == 1) { + offset = args[0].cast(); + PADDLE_ENFORCE_LT( + offset, numel, + platform::errors::InvalidArgument( + "index %d is out of bounds for size %d", offset, numel)); + } else { + PADDLE_ENFORCE_EQ(args.size(), dims.size(), + platform::errors::InvalidArgument( + "incorrect number of indices for Tensor")); + + for (size_t i = 0; i < args.size(); ++i) { + size_t index = args[i].cast(); + PADDLE_ENFORCE_LT( + index, dims[i], + platform::errors::InvalidArgument( + "index %d is out of bounds for axis %d with size %d", + index, i, dims[i])); + offset += index * strides[i]; + } + } +#define TENSOR_TO_PY_SCALAR(T, proto_type) \ + if (tensor.type() == proto_type) { \ + std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(proto_type); \ + T b = TensorGetElement(tensor, offset); \ + return py::array(py::dtype(py_dtype_str.c_str()), {}, {}, \ + static_cast(&b)); \ + } + + _ForEachDataType_(TENSOR_TO_PY_SCALAR); +#undef TENSOR_TO_PY_SCALAR + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported tensor data type: %s", + framework::DataTypeToString(tensor.type()))); + }, + py::return_value_policy::copy) .def("_inplace_version", [](imperative::VarBase &self) -> uint32_t { const auto &var = self.MutableVar(); @@ -1322,6 +1471,16 @@ void BindImperative(py::module *m_ptr) { return new_var; }, py::return_value_policy::copy) + .def("_copy_to", + [](const std::shared_ptr &self, + const platform::NPUPlace &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + 
py::return_value_policy::copy) .def("_copy_to", [](const std::shared_ptr &self, const platform::Place &place, bool blocking) { @@ -1341,28 +1500,22 @@ void BindImperative(py::module *m_ptr) { &imperative::VarBase::SetOverridedStopGradient) .def_property("persistable", &imperative::VarBase::Persistable, &imperative::VarBase::SetPersistable) - .def_property_readonly("shape", - [](imperative::VarBase &self) { - if (self.Var().IsType()) { - return framework::vectorize( - self.Var() - .Get() - .dims()); - } else if (self.Var() - .IsType< - framework::SelectedRows>()) { - return framework::vectorize( - self.Var() - .Get() - .value() - .dims()); - } else { - VLOG(2) << "It is meaningless to get shape of " - "variable type " - << GetTypeName(self); - return std::vector(); - } - }) + .def_property_readonly( + "shape", + [](imperative::VarBase &self) { + if (self.Var().IsType()) { + return framework::vectorize( + self.Var().Get().dims()); + } else if (self.Var().IsType()) { + return framework::vectorize( + self.Var().Get().value().dims()); + } else { + VLOG(2) << "It is meaningless to get shape of " + "variable type " + << GetTypeName(self); + return std::vector(); + } + }) .def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf, R"DOC( Whether a Tensor is leaf Tensor. @@ -1454,6 +1607,11 @@ void BindImperative(py::module *m_ptr) { self.SetExpectedPlace(*p); VLOG(4) << "Tracer(" << &self << ")" << " set expected place " << *p; + } else if (py::isinstance(obj)) { + auto p = obj.cast(); + self.SetExpectedPlace(*p); + VLOG(4) << "Tracer(" << &self << ")" + << " set expected place " << *p; } else if (py::isinstance(obj)) { auto p = obj.cast(); self.SetExpectedPlace(*p); @@ -1462,7 +1620,7 @@ void BindImperative(py::module *m_ptr) { } else { PADDLE_THROW(platform::errors::InvalidArgument( "Incompatible Place Type: supports XPUPlace, CUDAPlace, " - "CPUPlace, " + "CPUPlace, NPUPlace, " "and CUDAPinnedPlace, " "but got Unknown Type!")); } @@ -1523,6 +1681,19 @@ void BindImperative(py::module *m_ptr) { std::move(attrs), place, trace_backward); } }) + .def("trace", + [](imperative::Tracer &self, const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, const platform::NPUPlace &place, + bool trace_backward) { + auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); + { + py::gil_scoped_release release; + self.TraceOp(type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward); + } + }) .def("trace", [](imperative::Tracer &self, const std::string &type, const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, @@ -1575,6 +1746,13 @@ void BindImperative(py::module *m_ptr) { self.nrings_ = nrings; }); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); + m.def("varbase_copy", &VarBaseCopy); + m.def( "dygraph_partial_grad", [](const std::vector> &input_targets, @@ -1674,6 +1852,12 @@ void BindImperative(py::module *m_ptr) { const py::args args, const py::kwargs kwargs) { return imperative::PyLayerApply(place, cls, args, kwargs); }); + + m.def("pylayer_apply", + [](const platform::NPUPlace &place, const py::object &cls, + const py::args args, const py::kwargs kwargs) { + return imperative::PyLayerApply(place, cls, args, kwargs); + }); } } // namespace pybind diff --git a/paddle/fluid/pybind/inference_api.cc 
b/paddle/fluid/pybind/inference_api.cc index 8a5ad5852aedf5b157876c5d892d2ac4f42c022d..b2572e5aa4ba150c788ff2f0f728230f152aa76c 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -511,6 +511,7 @@ void BindAnalysisConfig(py::module *m) { py::arg("disable_trt_plugin_fp16") = false) .def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS) .def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled) + .def("exp_disable_tensorrt_ops", &AnalysisConfig::Exp_DisableTensorRtOPs) .def("enable_tensorrt_dla", &AnalysisConfig::EnableTensorRtDLA, py::arg("dla_core") = 0) .def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled) diff --git a/paddle/fluid/pybind/io.cc b/paddle/fluid/pybind/io.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc49f76305461f2f99ebad8f1c4a6a34cb1e5382 --- /dev/null +++ b/paddle/fluid/pybind/io.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/pybind/io.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" + +namespace py = pybind11; +namespace paddle { +namespace pybind { + +void BindIO(pybind11::module *m) { + m->def("save_lod_tensor", [](const paddle::framework::LoDTensor &tensor, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fout), true, + platform::errors::Unavailable( + "Cannot open %s to save variables.", str_file_name)); + paddle::framework::SerializeToStream(fout, tensor); + + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); + + m->def("load_lod_tensor", [](paddle::framework::LoDTensor &tensor, + const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fin), true, + platform::errors::Unavailable( + "Cannot open %s to load variables.", str_file_name)); + + paddle::framework::DeserializeFromStream(fin, &tensor); + int64_t tellg = fin.tellg(); + fin.close(); + return tellg; + }); + + m->def("save_selected_rows", + [](const paddle::framework::SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fout), true, + platform::errors::Unavailable( + "Cannot open %s to save SelectedRows.", str_file_name)); + + paddle::framework::SerializeToStream(fout, selected_rows); + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); + + m->def("load_selected_rows", + [](paddle::framework::SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::Unavailable( + "Cannot open %s to load SelectedRows.", str_file_name)); + + paddle::framework::DeserializeFromStream(fin, 
&selected_rows); + int64_t tellg = fin.tellg(); + fin.close(); + return tellg; + }); + + m->def("save_lod_tensor_to_memory", + [](const paddle::framework::LoDTensor &tensor) -> py::bytes { + std::ostringstream ss; + paddle::framework::SerializeToStream(ss, tensor); + return ss.str(); + }); + + m->def("load_lod_tensor_from_memory", [](paddle::framework::LoDTensor &tensor, + const std::string &tensor_bytes) { + std::istringstream fin(tensor_bytes, std::ios::in | std::ios::binary); + paddle::framework::DeserializeFromStream(fin, &tensor); + }); + + m->def("save_selected_rows_to_memory", + [](const paddle::framework::SelectedRows &selected_rows) -> py::bytes { + std::ostringstream ss; + paddle::framework::SerializeToStream(ss, selected_rows); + return ss.str(); + }); + + m->def("load_selected_rows_from_memory", + [](paddle::framework::SelectedRows &selected_rows, + const std::string &selected_rows_bytes) { + std::istringstream fin(selected_rows_bytes, + std::ios::in | std::ios::binary); + paddle::framework::DeserializeFromStream(fin, &selected_rows); + }); +} +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/operators/minus_op.cu b/paddle/fluid/pybind/io.h similarity index 60% rename from paddle/fluid/operators/minus_op.cu rename to paddle/fluid/pybind/io.h index 956d935da9b96696e9148fc4dfab23a6a6c29016..dfe3154cb95da529536c0022fc82169d476f3913 100644 --- a/paddle/fluid/operators/minus_op.cu +++ b/paddle/fluid/pybind/io.h @@ -1,10 +1,10 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -12,8 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/minus_op.h" +#pragma once -REGISTER_OP_CUDA_KERNEL( - minus, - paddle::operators::MinusKernel); +#include +#include "paddle/fluid/pybind/pybind_boost_headers.h" + +namespace paddle { +namespace pybind { +void BindIO(pybind11::module* m); +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/op_function.h b/paddle/fluid/pybind/op_function.h index 0c457531211b906d1d6e43363df4511aea5c7435..eaa70adcc89fe4c47f7c674e61d6a36ef36ad9c6 100644 --- a/paddle/fluid/pybind/op_function.h +++ b/paddle/fluid/pybind/op_function.h @@ -25,6 +25,7 @@ #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" @@ -34,6 +35,28 @@ namespace py = pybind11; namespace paddle { namespace pybind { +class OpAttrTypeMap { + public: + static OpAttrTypeMap& Instance() { + static OpAttrTypeMap g_op_attr_type_map; + return g_op_attr_type_map; + } + + std::unordered_map< + std::string, + std::unordered_map>& + Map() { + return ops_attrtype_map_; + } + + private: + OpAttrTypeMap() = default; + std::unordered_map< + std::string, + std::unordered_map> + ops_attrtype_map_; +}; + static inline std::shared_ptr CastPyHandleToVarBase( const std::string& op_type, const std::string& arg_name, int arg_idx, const py::handle& handle, bool dispensable = false) { @@ -173,6 +196,846 @@ static inline void HandleViewBetweenInputAndOutput( << "), share allocation and inplace version."; } } + +extern PyTypeObject* g_varbase_pytype; +extern PyTypeObject* g_vartype_pytype; +extern PyTypeObject* g_blockdesc_pytype; + +inline bool PyObject_CheckBool(PyObject** obj) { return PyBool_Check(*obj); } + +inline bool PyObject_CheckLongOrToLong(PyObject** obj) { + if ((PyLong_Check(*obj) && !PyBool_Check(*obj)) || + PyObject_IsInstance(*obj, (PyObject*)g_vartype_pytype) || // NOLINT + PyObject_IsInstance(*obj, (PyObject*)g_varbase_pytype)) { // NOLINT + return true; + } + + if (std::string(((PyTypeObject*)(*obj)->ob_type)->tp_name) // NOLINT + .find("numpy") != std::string::npos) { + auto to = PyNumber_Long(*obj); + if (to) { + *obj = to; + return true; + } + } + + return false; +} + +inline bool PyObject_CheckFloatOrToFloat(PyObject** obj) { + // sometimes users provide PyLong or numpy.int64 but attr is float + if (PyFloat_Check(*obj) || PyLong_Check(*obj) || + PyObject_IsInstance(*obj, (PyObject*)g_varbase_pytype)) { // NOLINT + return true; + } + if (std::string(((PyTypeObject*)(*obj)->ob_type)->tp_name) // NOLINT + .find("numpy") != std::string::npos) { + auto to = PyNumber_Float(*obj); + if (to) { + *obj = to; + return true; + } + } + return false; +} + +inline bool PyObject_CheckString(PyObject* obj) { return PyUnicode_Check(obj); } + +static inline void CastPyArg2AttrBoolean( + PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, ssize_t arg_pos) { + if (obj == Py_None) { + attrs[key] = false; // To be compatible with QA integration testing. Some + // test cases pass in None. 
+ } else if (obj == Py_True) { + attrs[key] = true; + } else if (obj == Py_False) { + attrs[key] = false; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "bool, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +static inline void CastPyArg2AttrInt( + PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, ssize_t arg_pos) { + if (PyObject_CheckLongOrToLong(&obj)) { + attrs[key] = (int)PyLong_AsLong(obj); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "int, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +static inline void CastPyArg2AttrLong( + PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, ssize_t arg_pos) { + if (PyObject_CheckLongOrToLong(&obj)) { + attrs[key] = (int64_t)PyLong_AsLong(obj); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "long, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +static inline void CastPyArg2AttrFloat( + PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, ssize_t arg_pos) { + if (PyObject_CheckFloatOrToFloat(&obj)) { + attrs[key] = (float)PyFloat_AsDouble(obj); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "float, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +static inline void CastPyArg2AttrString( + PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, ssize_t arg_pos) { + if (PyObject_CheckString(obj)) { + Py_ssize_t size; + const char* data; + data = PyUnicode_AsUTF8AndSize(obj, &size); + attrs[key] = std::string(data, (size_t)size); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "str, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +static inline void CastPyArg2AttrBooleans( + PyObject* obj, paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckBool(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of bool, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckBool(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of bool, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + 
i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +static inline void CastPyArg2AttrInts( + PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PySequence_Check(obj)) { + Py_ssize_t len = PySequence_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PySequence_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +static inline void CastPyArg2AttrLongs( + PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if 
(PySequence_Check(obj)) { + Py_ssize_t len = PySequence_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PySequence_GetItem(obj, i); + if (PyObject_CheckLongOrToLong(&item)) { + value.emplace_back(PyLong_AsLong(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of int, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +static inline void CastPyArg2AttrFloats( + PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PySequence_Check(obj)) { + Py_ssize_t len = PySequence_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PySequence_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +static inline void CastPyArg2AttrFloat64s( + PyObject* obj, paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { 
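+ // Tuple input: same per-element float64 conversion as the list branch above, + // but iterating with PyTuple_GetItem instead of PyList_GetItem.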
+ Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PySequence_Check(obj)) { + Py_ssize_t len = PySequence_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PySequence_GetItem(obj, i); + if (PyObject_CheckFloatOrToFloat(&item)) { + value.emplace_back(PyFloat_AsDouble(item)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of float, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +static inline void CastPyArg2AttrStrings( + PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, ssize_t arg_pos) { + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyList_GetItem(obj, i); + if (PyObject_CheckString(item)) { + Py_ssize_t size; + const char* data; + data = PyUnicode_AsUTF8AndSize(item, &size); + value.emplace_back(std::string(data, (size_t)size)); // NOLINT + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of str, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else if (PyTuple_Check(obj)) { + Py_ssize_t len = PyTuple_Size(obj); + PyObject* item = nullptr; + std::vector value; + for (Py_ssize_t i = 0; i < len; i++) { + item = PyTuple_GetItem(obj, i); + if (PyObject_CheckString(item)) { + Py_ssize_t size; + const char* data; + data = PyUnicode_AsUTF8AndSize(item, &size); + value.emplace_back(std::string(data, (size_t)size)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list of str, but got %s at pos %d", + op_type, arg_pos + 1, + ((PyTypeObject*)item->ob_type)->tp_name, // NOLINT + i)); + } + } + attrs[key] = value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "list or tuple, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } +} + +static inline void CastPyArg2AttrBlock( + PyObject* obj, paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, const std::string& op_type, ssize_t arg_pos) { + ::pybind11::detail::instance* inst = + (::pybind11::detail::instance*)obj; // NOLINT + + if (!PyObject_IsInstance((PyObject*)inst, // NOLINT + (PyObject*)g_blockdesc_pytype)) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "BlockDesc, but got %s", + op_type, arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + void** vh = 
inst->simple_layout ? inst->simple_value_holder + : &inst->nonsimple.values_and_holders[0]; + attrs[key] = reinterpret_cast(vh[0]); +} + +static inline void ConstructAttrMapFromPyArgs( + const std::string& op_type, PyObject* args, ssize_t attr_start, + ssize_t attr_end, paddle::framework::AttributeMap& attrs) { // NOLINT + PADDLE_ENFORCE_EQ( + (attr_end - attr_start) % 2, 0, + platform::errors::InvalidArgument( + "The number of arguments for attributes should be even.")); + + auto attr_type_map = &(OpAttrTypeMap::Instance().Map()[op_type]); + + PyObject* obj = nullptr; + for (ssize_t arg_pos = attr_start; arg_pos < attr_end; arg_pos += 2) { + Py_ssize_t key_len; + const char* key_ptr; + obj = PyTuple_GET_ITEM(args, arg_pos); + if (PyObject_CheckString(obj)) { + key_ptr = PyUnicode_AsUTF8AndSize(obj, &key_len); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be str, but got " + "%s", + op_type, arg_pos, ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + + std::string key(key_ptr, (size_t)key_len); + auto iter = attr_type_map->find(key); + if (iter == attr_type_map->end()) { + continue; + } + + obj = PyTuple_GET_ITEM(args, arg_pos + 1); + + switch (iter->second) { + case paddle::framework::proto::AttrType::INT: + CastPyArg2AttrInt(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::FLOAT: + CastPyArg2AttrFloat(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::STRING: + CastPyArg2AttrString(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::INTS: + CastPyArg2AttrInts(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::FLOATS: + CastPyArg2AttrFloats(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::STRINGS: + CastPyArg2AttrStrings(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::BOOLEAN: + CastPyArg2AttrBoolean(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::BOOLEANS: + CastPyArg2AttrBooleans(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::LONG: + CastPyArg2AttrLong(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::LONGS: + CastPyArg2AttrLongs(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::FLOAT64S: + CastPyArg2AttrFloat64s(obj, attrs, key, op_type, arg_pos); + break; + case paddle::framework::proto::AttrType::BLOCK: + CastPyArg2AttrBlock(obj, attrs, key, op_type, arg_pos); + break; + default: + break; + } + } +} + +static inline std::shared_ptr GetVarBaseFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false) { + ::pybind11::detail::instance* inst = + (::pybind11::detail::instance*)PyTuple_GET_ITEM(args, arg_idx); + + if (PyTuple_Check((PyObject*)inst)) { // NOLINT + inst = (::pybind11::detail::instance*)PyTuple_GET_ITEM(inst, 0); + } + + if (inst == nullptr || (PyObject*)inst == Py_None) { // NOLINT + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be Tensor, but got None", + op_type, arg_name, arg_idx)); + } + return nullptr; + } + + if (!PyObject_IsInstance((PyObject*)inst, // NOLINT + (PyObject*)g_varbase_pytype)) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be 
Tensor, but got " + "%s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)((PyObject*)inst)->ob_type)->tp_name)); // NOLINT + } + + void** vh = inst->simple_layout ? inst->simple_value_holder + : &inst->nonsimple.values_and_holders[0]; + return reinterpret_cast&>(vh[1]); +} + +static inline std::vector> +GetVarBaseListFromArgs(const std::string& op_type, const std::string& arg_name, + PyObject* args, ssize_t arg_idx, + bool dispensable = false) { + PyObject* list = PyTuple_GET_ITEM(args, arg_idx); + + if (list == nullptr) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensor, but got " + "None", + op_type, arg_name, arg_idx)); // NOLINT + } + return {}; + } + + std::vector> result; + + if (PyList_Check(list)) { + Py_ssize_t len = PyList_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + ::pybind11::detail::instance* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = (::pybind11::detail::instance*)PyList_GetItem(list, i); + if (!PyObject_IsInstance((PyObject*)item, // NOLINT + (PyObject*)g_varbase_pytype)) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but " + "got list of " + "%s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)((PyObject*)item)->ob_type)->tp_name)); // NOLINT + } + void** vh = item->simple_layout ? item->simple_value_holder + : &item->nonsimple.values_and_holders[0]; + result.emplace_back( + reinterpret_cast&>( + vh[1])); + } + } else if (PyTuple_Check(list)) { + Py_ssize_t len = PyTuple_Size(list); + if (len == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "empty list", + op_type, arg_name, arg_idx)); + } + ::pybind11::detail::instance* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + item = (::pybind11::detail::instance*)PyTuple_GetItem(list, i); // NOLINT + if (!PyObject_IsInstance((PyObject*)item, // NOLINT + (PyObject*)g_varbase_pytype)) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but " + "got list of " + "%s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)((PyObject*)item)->ob_type)->tp_name)); // NOLINT + } + void** vh = item->simple_layout ? 
item->simple_value_holder + : &item->nonsimple.values_and_holders[0]; + result.emplace_back( + reinterpret_cast&>( + vh[1])); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be list of Tensors, but got " + "%s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)list->ob_type)->tp_name)); // NOLINT + } + + return result; +} + +static inline unsigned long GetUnsignedLongFromArgs( // NOLINT + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false) { + PyObject* item = PyTuple_GET_ITEM(args, arg_idx); + + if (item == nullptr) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be long, but got None", + op_type, arg_name, arg_idx)); + } + return 0; + } + + if (PyObject_CheckLongOrToLong(&item)) { + return PyLong_AsUnsignedLong(item); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be " + "long, but got %s", + op_type, arg_name, arg_idx, + ((PyTypeObject*)item->ob_type)->tp_name)); // NOLINT + } +} + +static inline PyObject* MakeReturnPyObject( + const std::shared_ptr& out) { + return ::pybind11::detail::type_caster_base::cast_holder( + ::pybind11::detail::holder_helper< + std::shared_ptr>::get(out), + &out) + .ptr(); +} + +static inline PyObject* MakeReturnPyObject( + const std::vector>& out) { + PyObject* result = PyList_New((Py_ssize_t)out.size()); + + for (size_t i = 0; i < out.size(); i++) { + PyList_SET_ITEM( + result, (Py_ssize_t)i, + ::pybind11::detail::type_caster_base::cast_holder( + ::pybind11::detail::holder_helper< + std::shared_ptr>::get(out[i]), + &out[i]) + .ptr()); // NOLINT + } + + return result; +} + +template +struct TupleVarBasesResult { + static void Run(const Tuple& out, PyObject* result) { + TupleVarBasesResult::Run(out, result); + PyTuple_SET_ITEM(result, N - 1, MakeReturnPyObject(std::get(out))); + } +}; + +template +struct TupleVarBasesResult { + static void Run(const Tuple& out, PyObject* result) { + PyTuple_SET_ITEM(result, 0, MakeReturnPyObject(std::get<0>(out))); + } +}; + +template +static inline PyObject* MakeReturnPyObject(const std::tuple& out) { + auto len = sizeof...(Args); + PyObject* result = PyTuple_New(len); + + TupleVarBasesResult::Run(out, result); + + return result; +} + +void InitOpsAttrTypeMap() { + auto op_info_map = paddle::framework::OpInfoMap::Instance().map(); + for (auto iter = op_info_map.begin(); iter != op_info_map.end(); ++iter) { + auto op_proto = iter->second.proto_; + if (op_proto == nullptr) { + continue; + } + auto attrs_proto = op_proto->attrs(); + for (auto& attr : attrs_proto) { + OpAttrTypeMap::Instance().Map()[iter->first][attr.name()] = attr.type(); + } + } +} + +void ThrowExceptionToPython(std::exception_ptr p) { + static PyObject* EOFExceptionException = + PyErr_NewException("paddle.EOFException", PyExc_Exception, NULL); + static PyObject* EnforceNotMetException = + PyErr_NewException("paddle.EnforceNotMet", PyExc_Exception, NULL); + try { + if (p) std::rethrow_exception(p); + } catch (const platform::EOFException& e) { + PyErr_SetString(EOFExceptionException, e.what()); + } catch (const platform::EnforceNotMet& e) { + switch (e.code()) { + case paddle::platform::error::INVALID_ARGUMENT: + PyErr_SetString(PyExc_ValueError, e.what()); + break; + case paddle::platform::error::NOT_FOUND: + case paddle::platform::error::ALREADY_EXISTS: + case paddle::platform::error::PRECONDITION_NOT_MET: + case 
paddle::platform::error::PERMISSION_DENIED: + case paddle::platform::error::EXECUTION_TIMEOUT: + case paddle::platform::error::UNAVAILABLE: + PyErr_SetString(PyExc_RuntimeError, e.what()); + break; + case paddle::platform::error::OUT_OF_RANGE: + PyErr_SetString(PyExc_IndexError, e.what()); + break; + case paddle::platform::error::RESOURCE_EXHAUSTED: + PyErr_SetString(PyExc_MemoryError, e.what()); + break; + case paddle::platform::error::UNIMPLEMENTED: + PyErr_SetString(PyExc_NotImplementedError, e.what()); + break; + case paddle::platform::error::FATAL: + PyErr_SetString(PyExc_SystemError, e.what()); + break; + case paddle::platform::error::EXTERNAL: + PyErr_SetString(PyExc_OSError, e.what()); + break; + default: + PyErr_SetString(EnforceNotMetException, e.what()); + break; + } + } +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 237cec13a80259190fb97a42d5a3b86c1c9a48fe..b2205391a253c35f1c1e2852ddfe1a28666066b9 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -65,6 +65,7 @@ std::map> op_ins_map = { {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}}, {"momentum", {"Param", "Grad", "Velocity", "LearningRate"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, + {"run_program", {"X", "Params"}}, }; // NOTE(zhiqiu): Like op_ins_map. @@ -98,6 +99,7 @@ std::map> op_outs_map = { {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, {"lamb", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"run_program", {"DOut"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -123,6 +125,7 @@ std::map> op_passing_outs_map = { {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, {"accuracy", {"Correct", "Total"}}, {"fill_constant", {"Out"}}, + {"recv_v2", {"Out"}}, {"matmul", {"Out"}}, {"c_broadcast", {"Out"}}, {"c_sync_calc_stream", {"Out"}}, @@ -147,6 +150,7 @@ std::map> op_passing_outs_map = { {"lamb", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"rnn", {"DropoutState"}}, + {"run_program", {"Out", "DOut", "OutScope"}}, }; // NOTE(pangyoki): Tensor View Strategy. 
@@ -172,7 +176,7 @@ std::set inplace_op_duplicable_ins_set = { // clang-format off const char* OUT_INITIALIZER_TEMPLATE = - R"({"%s", {std::shared_ptr(new imperative::VarBase(tracer->GenerateUniqueName()))}})"; + R"({"%s", {std::shared_ptr(new imperative::VarBase("auto_"+std::to_string(VarBaseUniqueNameID++)+"_"))}})"; const char* OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableOutput(%s)})"; const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})"; @@ -208,16 +212,17 @@ const char* OUT_VAR_TYPE = R"(std::shared_ptr)"; const char* OUT_VAR_LIST_TYPE = R"(std::vector>)"; const char* CAST_VAR_TEMPLATE = R"( - auto %s = CastPyHandleToVarBase("%s", "%s", %d, %s, %s);)"; + auto %s = GetVarBaseFromArgs("%s", "%s", args, %d, %s);)"; const char* CAST_VAR_LIST_TEMPLATE = R"( - auto %s = CastPyHandleToVarBaseList("%s", "%s", %d, %s, %s);)"; + auto %s = GetVarBaseListFromArgs("%s", "%s", args, %d, %s);)"; +const char* CAST_SIZE_T_TEMPLATE = R"( + auto %s = GetUnsignedLongFromArgs("%s", "%s", args, %d, %s);)"; const char* ARG_TEMPLATE = R"(const %s& %s)"; const char* RETURN_TUPLE_TYPE = R"(std::tuple<%s>)"; -const char* RETURN_TYPE = R"(%s)"; const char* RETURN_TUPLE_TEMPLATE = R"(std::make_tuple(%s))"; const char* RETURN_LIST_TEMPLATE = R"(outs["%s"])"; const char* RETURN_TEMPLATE = R"(outs["%s"][0])"; @@ -247,24 +252,34 @@ const char* INPLACE_MAPPING_TEMPLATE = R"({"%s", "%s"})"; const char* OP_FUNCTION_TEMPLATE = R"( -%s %s(%s) +static PyObject * %s(PyObject *self, PyObject *args, PyObject *kwargs) { - %s - framework::AttributeMap attrs; - ConstructAttrMapFromPyArgs("%s", %d, &attrs, args); + PyThreadState *tstate = nullptr; + try { - py::gil_scoped_release release; - auto tracer = imperative::GetCurrentTracer(); + %s + framework::AttributeMap attrs; + ConstructAttrMapFromPyArgs("%s", args, %d, PyTuple_GET_SIZE(args) , attrs); + tstate = PyEval_SaveThread(); %s imperative::NameVarBaseMap outs = %s; imperative::NameVarBaseMap ins = %s; %s - tracer->TraceOp("%s", ins, outs, attrs, {%s}); - return %s; + imperative::GetCurrentTracer()->TraceOp("%s", ins, outs, attrs, {%s}); + PyEval_RestoreThread(tstate); + tstate = nullptr; + %s + } + catch(...) { + if (tstate) { + PyEval_RestoreThread(tstate); + } + ThrowExceptionToPython(std::current_exception()); + return nullptr; } })"; -const char* PYBIND_ITEM_TEMPLATE = R"( %s.def("%s", &%s);)"; +const char* PYBIND_ITEM_TEMPLATE = R"( {"%s", (PyCFunction)(void(*)(void))%s, METH_VARARGS | METH_KEYWORDS, "C++ interface function for %s in dygraph."},)"; // clang-format on static inline bool FindInsMap(const std::string& op_type, @@ -323,9 +338,8 @@ std::string GenerateOpFunctionsBody( const auto in_cast_type = input.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE; auto dispensable = input.dispensable() ? 
"true" : "false"; - ins_cast_str += - paddle::string::Sprintf(in_cast_type, in_name, op_type, in_name, - arg_idx++, TempName(in_name), dispensable); + ins_cast_str += paddle::string::Sprintf(in_cast_type, in_name, op_type, + in_name, arg_idx++, dispensable); if (input.dispensable()) { const auto in_template = input.duplicable() @@ -353,7 +367,6 @@ std::string GenerateOpFunctionsBody( // Generate outs initializer std::string outs_initializer = "{"; std::string outs_initializer_with_null = ""; - std::string return_type = ""; std::string inplace_mapping_str = ""; std::string return_str = ""; @@ -392,6 +405,12 @@ std::string GenerateOpFunctionsBody( paddle::string::Sprintf(out_template, out_name, out_name); outs_initializer += ","; } + + const auto in_cast_type = + output.duplicable() ? CAST_VAR_LIST_TEMPLATE : CAST_VAR_TEMPLATE; + auto dispensable = output.dispensable() ? "true" : "false"; + ins_cast_str += paddle::string::Sprintf(in_cast_type, out_name, op_type, + out_name, arg_idx++, dispensable); } else if (use_inplace_strategy && inplace_map.count(out_name)) { PADDLE_ENFORCE_NE( inplace_map[out_name], "", @@ -437,6 +456,11 @@ std::string GenerateOpFunctionsBody( input_args_num++; outs_initializer += paddle::string::Sprintf( OUT_DUPLICABLE_INITIALIZER_TEMPLATE, out_name, out_num_str); + + auto dispensable = output.dispensable() ? "true" : "false"; + ins_cast_str += + paddle::string::Sprintf(CAST_SIZE_T_TEMPLATE, out_num_str, op_type, + out_num_str, arg_idx++, dispensable); } else { outs_initializer += paddle::string::Sprintf(OUT_INITIALIZER_TEMPLATE, out_name); @@ -444,15 +468,12 @@ std::string GenerateOpFunctionsBody( outs_initializer += ","; } - return_type += out_type; - return_type += ","; return_str += paddle::string::Sprintf(return_template, out_name); return_str += ","; outs_num += 1; } if (outs_initializer.back() == ',') { outs_initializer.pop_back(); - return_type.pop_back(); return_str.pop_back(); } outs_initializer += "}"; @@ -467,11 +488,13 @@ std::string GenerateOpFunctionsBody( viwe_input_name, viwe_output_name); } if (outs_num == 0) { - return_type = "void"; - } - if (outs_num > 1) { - return_str = paddle::string::Sprintf(RETURN_TUPLE_TEMPLATE, return_str); - return_type = paddle::string::Sprintf(RETURN_TUPLE_TYPE, return_type); + return_str = "Py_INCREF(Py_None);\n return Py_None;"; + } else if (outs_num == 1) { + return_str = "return MakeReturnPyObject(" + return_str + ");"; + } else { + return_str = "return MakeReturnPyObject(" + + paddle::string::Sprintf(RETURN_TUPLE_TEMPLATE, return_str) + + ");"; } std::string function_args = ""; if (input_args == "") { @@ -482,17 +505,17 @@ std::string GenerateOpFunctionsBody( // generate op funtcion body auto op_function_str = paddle::string::Sprintf( - OP_FUNCTION_TEMPLATE, return_type, func_name, function_args, ins_cast_str, - op_type, input_args_num, inplace_strategy_str, outs_initializer, - ins_initializer, ins_initializer_with_null + outs_initializer_with_null + - view_strategy_str, + OP_FUNCTION_TEMPLATE, func_name, ins_cast_str, op_type, input_args_num, + inplace_strategy_str, outs_initializer, ins_initializer, + ins_initializer_with_null + outs_initializer_with_null + + view_strategy_str, op_type, inplace_mapping_str, return_str); return op_function_str; } static std::tuple, std::vector> -GenerateOpFunctions(const std::string& module_name) { +GenerateOpFunctions() { auto& op_info_map = paddle::framework::OpInfoMap::Instance().map(); std::vector op_function_list, bind_function_list; @@ -533,7 +556,7 @@ 
GenerateOpFunctions(const std::string& module_name) { // generate pybind item auto bind_function_str = paddle::string::Sprintf( - PYBIND_ITEM_TEMPLATE, module_name, op_type, func_name); + PYBIND_ITEM_TEMPLATE, op_type, func_name, op_type); op_function_list.emplace_back(std::move(op_function_str)); bind_function_list.emplace_back(std::move(bind_function_str)); @@ -548,8 +571,8 @@ GenerateOpFunctions(const std::string& module_name) { // generate pybind item auto inplace_bind_function_str = - paddle::string::Sprintf(PYBIND_ITEM_TEMPLATE, module_name, - inplace_op_type, inplace_func_name); + paddle::string::Sprintf(PYBIND_ITEM_TEMPLATE, inplace_op_type, + inplace_func_name, inplace_op_type); op_function_list.emplace_back(std::move(inplace_op_function_str)); bind_function_list.emplace_back(std::move(inplace_bind_function_str)); @@ -569,7 +592,9 @@ int main(int argc, char* argv[]) { ascend_ptr->InitGEForUT(); #endif - std::vector headers{"\"paddle/fluid/imperative/tracer.h\""}; + std::vector headers{"\"paddle/fluid/imperative/tracer.h\"", + "\"pybind11/detail/common.h\"", + ""}; std::ofstream out(argv[1], std::ios::out); @@ -579,21 +604,29 @@ int main(int argc, char* argv[]) { out << "#include " + header + "\n"; } - auto op_funcs = GenerateOpFunctions("m"); + out << "\n\n"; + + auto op_funcs = GenerateOpFunctions(); - out << "namespace py = pybind11;" - << "\n"; out << "namespace paddle {\n" - << "namespace pybind {\n"; + << "namespace pybind {\n\n"; + out << "std::atomic VarBaseUniqueNameID{0};\n"; out << paddle::string::join_strings(std::get<0>(op_funcs), '\n'); out << "\n\n"; - out << "inline void BindOpFunctions(pybind11::module *module) {\n" - << " auto m = module->def_submodule(\"ops\");\n\n"; + out << "static PyMethodDef ExtestMethods[] = {\n" + << paddle::string::join_strings(std::get<1>(op_funcs), '\n') + << "\n {nullptr,nullptr,0,nullptr}" + << "};\n\n"; - out << paddle::string::join_strings(std::get<1>(op_funcs), '\n'); - out << "\n"; - out << "}\n\n" + out << "inline void BindOpFunctions(pybind11::module *module) {\n" + << " auto m = module->def_submodule(\"ops\");\n" + << " if (PyModule_AddFunctions(m.ptr(), ExtestMethods) < 0) {\n" + << " PADDLE_THROW(platform::errors::Fatal (\"Add functions to " + "core.ops failed!\"));\n" + << " }\n\n" + << " InitOpsAttrTypeMap();" + << "}\n\n" << "} // namespace pybind\n" << "} // namespace paddle\n"; diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 6fa49a85423c58061975007f9c2f4467c8d1ad09..f4b68eb438200e39cbd26cb5e297c62408d4a9cb 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -29,6 +29,9 @@ limitations under the License. 
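The functions emitted by `GenerateOpFunctionsBody` above are registered on `core.ops` through the `PyMethodDef` table that `BindOpFunctions` now installs. A rough usage sketch, assuming a dygraph build of this branch; `scale` and its attribute names are just one example of the positional `(attr_name, attr_value)` convention parsed by `ConstructAttrMapFromPyArgs`:

```python
# Rough sketch of calling a generated core.ops entry point directly:
# inputs go first, attributes follow as (name, value) pairs.
import paddle
from paddle.fluid import core

x = paddle.ones([2, 3])  # dygraph mode is the default in 2.x
out = core.ops.scale(x, 'scale', 2.0, 'bias', 0.5, 'bias_after_scale', True)
print(out.numpy())       # every element is 1.0 * 2.0 + 0.5 = 2.5
```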
*/ namespace paddle { namespace pybind { +PyTypeObject *g_vartype_pytype = nullptr; +PyTypeObject *g_blockdesc_pytype = nullptr; + namespace pd = paddle::framework; template @@ -82,8 +85,9 @@ void BindProgramDesc(pybind11::module *m) { } void BindBlockDesc(pybind11::module *m) { - pybind11::class_(*m, "BlockDesc", "") - .def_property_readonly("id", &pd::BlockDesc::ID) + pybind11::class_ blockdesc(*m, "BlockDesc", ""); + g_blockdesc_pytype = (PyTypeObject *)blockdesc.ptr(); // NOLINT + blockdesc.def_property_readonly("id", &pd::BlockDesc::ID) .def_property_readonly("parent", &pd::BlockDesc::Parent) .def("get_forward_block_idx", &pd::BlockDesc::ForwardBlockID) .def("_set_forward_block_idx", &pd::BlockDesc::SetForwardBlockID) @@ -174,8 +178,9 @@ void BindVarDsec(pybind11::module *m) { .def("need_check_feed", &pd::VarDesc::NeedCheckFeed) .def("set_need_check_feed", &pd::VarDesc::SetNeedCheckFeed); - pybind11::enum_(var_desc, "VarType", "") - .value("BOOL", pd::proto::VarType::BOOL) + pybind11::enum_ vartype(var_desc, "VarType", ""); + g_vartype_pytype = (PyTypeObject *)vartype.ptr(); // NOLINT + vartype.value("BOOL", pd::proto::VarType::BOOL) .value("UINT8", pd::proto::VarType::UINT8) .value("INT8", pd::proto::VarType::INT8) .value("INT16", pd::proto::VarType::INT16) diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 0c239f8157e5dff03ba71bb018c77b7b5a4b86a6..48365f42b11ba9a7afc4cb3578c2bbbc7002fc84 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -47,7 +47,11 @@ void BindPSGPUWrapper(py::module* m) { py::call_guard()) .def("end_pass", &framework::PSGPUWrapper::EndPass, py::call_guard()) - .def("build_gpu_ps", &framework::PSGPUWrapper::BuildGPUPS, + .def("begin_pass", &framework::PSGPUWrapper::BeginPass, + py::call_guard()) + .def("load_into_memory", &framework::PSGPUWrapper::LoadIntoMemory, + py::call_guard()) + .def("finalize", &framework::PSGPUWrapper::Finalize, py::call_guard()); } // end PSGPUWrapper #endif diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 560d8c892b09f9b6f17136040455ee8469587f53..4a43e51e7cabcfe76418f7187f755bb0bce5455d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -31,6 +31,7 @@ limitations under the License. */ #include "paddle/fluid/framework/custom_operator.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/garbage_collector.h" @@ -68,6 +69,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/monitor.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/pybind/io.h" #ifdef PADDLE_WITH_ASCEND #include "paddle/fluid/pybind/ascend_wrapper_py.h" #endif @@ -184,6 +186,14 @@ bool IsCompiledWithMKLDNN() { #endif } +bool IsCompiledWithHETERPS() { +#ifndef PADDLE_WITH_HETERPS + return false; +#else + return true; +#endif +} + bool SupportsBfloat16() { #ifndef PADDLE_WITH_MKLDNN return false; @@ -224,7 +234,9 @@ OpSupportedInfos(const std::string &place, [](unsigned char c) { return std::toupper(c); }); using fn_type = std::add_pointer::type; std::unordered_map is_target_place{ - {"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place}, + {"GPU", &platform::is_gpu_place}, + {"CPU", &platform::is_cpu_place}, + {"XPU", &platform::is_xpu_place}, }; PADDLE_ENFORCE_NE( is_target_place.count(query_place), 0, @@ -496,70 +508,6 @@ PYBIND11_MODULE(core_noavx, m) { #endif return tensor; }); - m.def("_save_lod_tensor", [](const LoDTensor &tensor, - const std::string &str_file_name) { - std::ofstream fout(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fout), true, - platform::errors::Unavailable( - "Cannot open %s to save variables.", str_file_name)); - SerializeToStream(fout, tensor); - - int64_t tellp = fout.tellp(); - fout.close(); - return tellp; - }); - m.def("_load_lod_tensor", [](LoDTensor &tensor, - const std::string &str_file_name) { - std::ifstream fin(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fin), true, - platform::errors::Unavailable( - "Cannot open %s to load variables.", str_file_name)); - - DeserializeFromStream(fin, &tensor); - int64_t tellg = fin.tellg(); - fin.close(); - return tellg; - }); - m.def("_save_selected_rows", [](const SelectedRows &selected_rows, - const std::string &str_file_name) { - std::ofstream fout(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fout), true, - platform::errors::Unavailable("Cannot open %s to save SelectedRows.", - str_file_name)); - - SerializeToStream(fout, selected_rows); - int64_t tellp = fout.tellp(); - fout.close(); - return tellp; - }); - m.def("_load_selected_rows", - [](SelectedRows &selected_rows, const std::string &str_file_name) { - std::ifstream fin(str_file_name, std::ios::binary); - PADDLE_ENFORCE_EQ( - static_cast(fin), true, - platform::errors::Unavailable( - "Cannot open %s to load SelectedRows.", str_file_name)); - - DeserializeFromStream(fin, &selected_rows); - int64_t tellg = fin.tellg(); - fin.close(); - return tellg; - }); - m.def("_save_static_dict", - [](const std::string &str_file_name, const py::handle &vec_var_list, - const Scope &scope) { - std::vector vec_name_list = GetNameList(vec_var_list); - SaveStaticNameListToDisk(str_file_name, vec_name_list, scope); - }); - - m.def("_load_static_dict", - [](const std::string &str_file_name, const py::handle &vec_var_list, - const Scope &scope, const Executor *executor) { - std::vector vec_name_list = GetNameList(vec_var_list); - CreateVariableIfNotExit(vec_var_list, scope, executor); - LoadStaticNameListFromDisk(str_file_name, vec_name_list, scope); - }); m.def("_create_loaded_parameter", [](const py::handle &vec_var_list, const Scope &scope, @@ -567,26 +515,6 @@ PYBIND11_MODULE(core_noavx, m) { CreateVariableIfNotExit(vec_var_list, scope, executor); }); - m.def("_save_dygraph_dict", [](const std::string &str_file_name, - const PyNameVarBaseMap &state_dict) { - auto vec_var_base_list = GetVarBaseList(state_dict); 
- - SaveDygraphVarBaseListToDisk(str_file_name, vec_var_base_list); - }); - - m.def("_load_dygraph_dict", [](const std::string &str_file_name) { - auto load_tensor = LoadDygraphVarBaseListFromDisk(str_file_name); - - std::unordered_map> - map_output; - - for (size_t i = 0; i < load_tensor.size(); ++i) { - map_output.emplace(load_tensor[i]->Name(), load_tensor[i]); - } - - return map_output; - }); - m.def("save_op_version_info", [](framework::ProgramDesc &desc) { framework::compatible::pb::OpVersionMap pb_vmap{desc.OpVersionMap()}; framework::compatible::SaveOpVersions( @@ -1391,7 +1319,7 @@ All parameter, weight, gradient are variables in Paddle. if (info != nullptr) { if (info->HasOpProtoAndChecker()) { auto op_checker = info->Checker(); - res = op_checker->GetAttrsDefaultValuesMap(); + res = op_checker->GetDefaultAttrsMap(); } } return res; @@ -1799,6 +1727,8 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("get_device_id", + [](const platform::NPUPlace &self) { return self.GetDeviceId(); }) .def("__str__", string::to_string); py::class_(m, "Place") @@ -1920,6 +1850,8 @@ All parameter, weight, gradient are variables in Paddle. py::return_value_policy::reference) .def("finalize", &TrainerBase::Finalize); + m.def("_get_eager_deletion_vars", &framework::GetEagerDeletionCleanVars); + py::class_(m, "Executor") .def(py::init()) .def("close", &Executor::Close) @@ -1989,6 +1921,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_compiled_with_npu", IsCompiledWithNPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); + m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS); m.def("supports_bfloat16", SupportsBfloat16); m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance); m.def("op_supported_infos", OpSupportedInfos); @@ -3111,6 +3044,7 @@ All parameter, weight, gradient are variables in Paddle. .def("device_count", &ParallelExecutor::DeviceCount); BindFleetWrapper(&m); + BindIO(&m); #ifdef PADDLE_WITH_PSLIB BindHeterWrapper(&m); @@ -3159,7 +3093,7 @@ All parameter, weight, gradient are variables in Paddle. BindTreeIndex(&m); BindIndexWrapper(&m); BindIndexSampler(&m); - + BindSparseShardingTools(&m); #endif } } // namespace pybind diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 416361d06a996e492118a995a6c0aa28ac38dc1a..68e6e049cdbb0cd508536741c4902143f65f8f76 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/platform/bfloat16.h" @@ -84,9 +85,9 @@ struct npy_format_descriptor { static constexpr auto name = _("bfloat16"); }; -// we register paddle::platform::complex64 as numpy.complex64. +// we register paddle::platform::complex as numpy.complex64. 
template <> -struct npy_format_descriptor { +struct npy_format_descriptor> { static py::dtype dtype() { handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_COMPLEX64); return reinterpret_borrow(ptr); @@ -103,9 +104,8 @@ struct npy_format_descriptor { static constexpr auto name = _("complext64"); }; -// we register paddle::platform::complex128 as numpy.complex128. template <> -struct npy_format_descriptor { +struct npy_format_descriptor> { static py::dtype dtype() { handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_COMPLEX128); return reinterpret_borrow(ptr); @@ -168,8 +168,8 @@ struct ValidDTypeToPyArrayChecker { DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::float16); DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::bfloat16); -DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex64); -DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex128); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex); +DECLARE_VALID_DTYPE_TO_PY_ARRAY(platform::complex); DECLARE_VALID_DTYPE_TO_PY_ARRAY(float); DECLARE_VALID_DTYPE_TO_PY_ARRAY(double); DECLARE_VALID_DTYPE_TO_PY_ARRAY(bool); @@ -188,9 +188,9 @@ inline std::string TensorDTypeToPyDTypeStr( } else if (std::is_same::value) { \ /* NumPy character code of uint16 due to no support for bfloat16 */ \ return "H"; \ - } else if (std::is_same::value) { \ + } else if (std::is_same>::value) { \ return "F"; \ - } else if (std::is_same::value) { \ + } else if (std::is_same>::value) { \ return "D"; \ } else { \ constexpr auto kIsValidDType = ValidDTypeToPyArrayChecker::kValue; \ @@ -367,12 +367,14 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj, } else if (py::isinstance>(array)) { SetTensorFromPyArrayT(self, array, place, zero_copy); - } else if (py::isinstance>(array)) { - SetTensorFromPyArrayT(self, array, place, - zero_copy); - } else if (py::isinstance>(array)) { - SetTensorFromPyArrayT(self, array, place, - zero_copy); + } else if (py::isinstance>>( + array)) { + SetTensorFromPyArrayT, P>( + self, array, place, zero_copy); + } else if (py::isinstance>>( + array)) { + SetTensorFromPyArrayT, P>( + self, array, place, zero_copy); } else if (py::isinstance>(array)) { // since there is still no support for bfloat16 in NumPy, // uint16 is used for casting bfloat16 @@ -401,8 +403,8 @@ void _sliceCompute(const framework::Tensor *in, framework::Tensor *out, auto out_dims = out->dims(); auto in_dims = in->dims(); - auto offsets = Eigen::array(); - auto extents = Eigen::array(); + auto offsets = Eigen::DSizes(); + auto extents = Eigen::DSizes(); for (size_t i = 0; i < D; ++i) { offsets[i] = 0; extents[i] = out_dims[i]; @@ -422,7 +424,8 @@ void _sliceCompute(const framework::Tensor *in, framework::Tensor *out, auto out_t = framework::EigenTensor::From( *out); - out_t.device(eigen_place) = in_t.slice(offsets, extents); + operators::EigenSlice, T, D>::Eval( + eigen_place, out_t, in_t, offsets, extents); } template @@ -594,9 +597,9 @@ inline framework::Tensor *_sliceTensor(const framework::Tensor &self, case framework::proto::VarType::BF16: return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::COMPLEX64: - return _sliceAndConcat(self, obj, dim); + return _sliceAndConcat>(self, obj, dim); case framework::proto::VarType::COMPLEX128: - return _sliceAndConcat(self, obj, dim); + return _sliceAndConcat>(self, obj, dim); case framework::proto::VarType::FP32: return _sliceAndConcat(self, obj, dim); case framework::proto::VarType::FP64: diff --git a/paddle/scripts/conda_build.py b/paddle/scripts/conda_build.py index 
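A short Python sketch of the user-visible behavior that the `paddle::platform::complex<T>` registrations above keep working (this hunk swaps the old `complex64`/`complex128` types for `complex<float>`/`complex<double>`); it assumes a built wheel, and the array values are arbitrary:

```python
# numpy complex64/complex128 arrays map onto the registered
# paddle::platform::complex<float>/<double> descriptors and round-trip cleanly.
import numpy as np
import paddle

x64 = np.array([1 + 2j, 3 - 4j], dtype=np.complex64)
t64 = paddle.to_tensor(x64)
print(t64.dtype)                       # complex64 tensor dtype
print(np.allclose(t64.numpy(), x64))   # True

x128 = x64.astype(np.complex128)
print(paddle.to_tensor(x128).dtype)    # complex128 tensor dtype
```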
e9153583f133771650cf115369dae231e6f1a3f0..2fe02dc51bf536d4132395c9c893f3fb1e9fbb74 100644 --- a/paddle/scripts/conda_build.py +++ b/paddle/scripts/conda_build.py @@ -44,42 +44,33 @@ build: self.requirement_build = r""" requirements: build: - - numpy>=1.12 + - numpy>=1.13 - cython - setuptools """ self.requirement_run = r""" run: - - numpy>1.12 + - requests>=2.20.0 + - numpy>=1.13 + - protobuf>=3.1.0 + - gast==0.3.3 + - Pillow - six - decorator - - nltk - - scipy - - requests - - pillow - - graphviz - - protobuf - - py-cpuinfo==5.0.0 - astor - - gast>=0.3.3 - - matplotlib """ self.requirement_run_windows = r""" run: - - numpy>=1.12 + - requests>=2.20.0 + - numpy>=1.13 + - protobuf>=3.1.0 + - gast==0.3.3 + - Pillow - six - decorator - - nltk - - scipy - - requests - - pillow - - graphviz - - protobuf - astor - - gast>=0.3.3 - - py-cpuinfo==5.0.0 """ self.test = r""" test: @@ -96,37 +87,20 @@ about: """ self.build_const = r""" -pip install /package/objgraph-3.4.1.tar.gz -pip install /package/rarfile-3.0.tar.gz --no-deps """ self.blt_const = r""" -pip install C:\package\objgraph-3.4.1.tar.gz -pip install C:\package\rarfile-3.0.tar.gz --no-deps -git clone https://github.com/PaddlePaddle/recordio.git -cd recordio\python -python setup.py install """ - self.python27 = r" - python>=2.7, <3.0" - self.python35 = r" - python>=3.5, <3.6" self.python36 = r" - python>=3.6, <3.7" self.python37 = r" - python>=3.7, <3.8" self.python38 = r" - python>=3.8, <3.9" + self.python39 = r" - python>=3.9, <3.10" self.python_version = [ - self.python27, self.python35, self.python36, self.python37, - self.python38 + self.python36, self.python37, self.python38, self.python39 ] - self.cuda90 = r""" - - cudatoolkit>=9.0, <9.1 - - cudnn>=7.6, <7.7 - """ - self.cuda100 = r""" - - cudatoolkit>=10.0, <10.1 - - cudnn>=7.6, <7.7 - """ self.cuda101 = r""" - cudatoolkit>=10.1, <10.2 - cudnn>=7.6, <7.7 @@ -135,30 +109,31 @@ python setup.py install - cudatoolkit>=10.2, <10.3 - cudnn>=7.6, <7.7 """ - self.cuda_info = [(self.cuda90, "cuda9.0", ".post90"), - (self.cuda100, "cuda10.0", ".post100"), - (self.cuda101, "cuda10.1", ".post101"), - (self.cuda102, "cuda10.2", "")] - self.py_str = ["py27", "py35", "py36", "py37", "py38"] + self.cuda112 = r""" + - cudatoolkit>=11.2, <11.3 + - cudnn>=8.1, <8.2 + """ + + self.cuda_info = [(self.cuda101, "cuda10.1", ".post101"), + (self.cuda102, "cuda10.2", ""), + (self.cuda112, "cuda11.2", ".post112")] + self.py_str = ["py36", "py37", "py38", "py39"] self.pip_end = ".whl --no-deps" self.pip_prefix_linux = "pip install /package/paddlepaddle" self.pip_prefix_windows = r"pip install C:\package\paddlepaddle" self.pip_gpu = "_gpu-" self.pip_cpu = "-" self.mac_pip = [ - "-cp27-cp27m-macosx_10_6_intel", "-cp35-cp35m-macosx_10_6_intel", "-cp36-cp36m-macosx_10_6_intel", "-cp37-cp37m-macosx_10_6_intel", - "-cp38-cp38-macosx_10_14_x86_64" + "-cp38-cp38-macosx_10_14_x86_64", "-cp39-cp39-macosx_10_14_x86_64" ] self.linux_pip = [ - "-cp27-cp27mu-manylinux1_x86_64", "-cp35-cp35m-manylinux1_x86_64", - "-cp36-cp36m-manylinux1_x86_64", "-cp37-cp37m-manylinux1_x86_64", - "-cp38-cp38-manylinux1_x86_64" + "-cp36-cp36m-linux_x86_64", "-cp37-cp37m-linux_x86_64", + "-cp38-cp38-linux_x86_64", "-cp39-cp39-linux_x86_64" ] self.windows_pip = [ - "-cp27-cp27m-win_amd64", "-cp35-cp35m-win_amd64", "-cp36-cp36m-win_amd64", "-cp37-cp37m-win_amd64", - "-cp38-cp38-win_amd64" + "-cp38-cp38-win_amd64", "-cp39-cp39-win_amd64" ] @@ -233,12 +208,7 @@ package: requirement = var.requirement_build + python_str + var.requirement_run_windows 
+ python_str meta_build = var.build + build_name_str meta_str = package_str + meta_build + requirement - if (python_str == var.python27 or python_str == var.python35): - meta_str = meta_str + """ - - matplotlib<=2.2.4""" - else: - meta_str = meta_str + """ - - matplotlib""" + if not (cuda_str == None): meta_str = meta_str + cuda_str diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 787f5297e740541f71faba73677c4af3b8037099..bebcfe64406d9ed43ae665e50fa280dc0595a057 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -18,74 +18,87 @@ rem Paddle CI Task On Windows Platform rem ================================================= @ECHO ON -setlocal +setlocal enabledelayedexpansion rem -------clean up environment----------- set work_dir=%cd% -set cache_dir=%work_dir:Paddle=cache% +if not defined cache_dir set cache_dir=%work_dir:Paddle=cache% if not exist %cache_dir%\tools ( git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools ) -taskkill /f /im op_function_generator.exe 2>NUL -taskkill /f /im cmake.exe 2>NUL -taskkill /f /im MSBuild.exe 2>NUL -taskkill /f /im CL.exe 2>NUL -taskkill /f /im Lib.exe 2>NUL -taskkill /f /im link.exe 2>NUL -taskkill /f /im vctip.exe 2>NUL -taskkill /f /im cvtres.exe 2>NUL -taskkill /f /im rc.exe 2>NUL -taskkill /f /im mspdbsrv.exe 2>NUL -taskkill /f /im csc.exe 2>NUL -taskkill /f /im python.exe 2>NUL -taskkill /f /im nvcc.exe 2>NUL -taskkill /f /im cicc.exe 2>NUL -taskkill /f /im ptxas.exe 2>NUL -taskkill /f /im test_api_impl.exe 2>NUL -taskkill /f /im op_function_generator.exe 2>NUL +taskkill /f /im cmake.exe /t 2>NUL +taskkill /f /im ninja.exe /t 2>NUL +taskkill /f /im MSBuild.exe /t 2>NUL +taskkill /f /im cl.exe /t 2>NUL +taskkill /f /im lib.exe /t 2>NUL +taskkill /f /im link.exe /t 2>NUL +taskkill /f /im vctip.exe /t 2>NUL +taskkill /f /im cvtres.exe /t 2>NUL +taskkill /f /im rc.exe /t 2>NUL +taskkill /f /im mspdbsrv.exe /t 2>NUL +taskkill /f /im csc.exe /t 2>NUL +taskkill /f /im python.exe /t 2>NUL +taskkill /f /im nvcc.exe /t 2>NUL +taskkill /f /im cicc.exe /t 2>NUL +taskkill /f /im ptxas.exe /t 2>NUL +taskkill /f /im op_function_generator.exe /t 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL -wmic process where name="test_api_impl.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL -wmic process where name="CL.exe" call terminate 2>NUL -wmic process where name="Lib.exe" call terminate 2>NUL +wmic process where name="cl.exe" call terminate 2>NUL +wmic process where name="lib.exe" call terminate 2>NUL wmic process where name="python.exe" call terminate 2>NUL rem ------initialize common variable------ if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64" if not defined BRANCH set BRANCH=develop -if not defined WITH_TENSORRT set WITH_TENSORRT=ON +if not defined WITH_TENSORRT set WITH_TENSORRT=ON if not defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto if not defined WITH_GPU set WITH_GPU=ON if not defined WITH_MKL set WITH_MKL=ON if not defined WITH_AVX set WITH_AVX=ON if not defined WITH_TESTING set WITH_TESTING=ON -if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=OFF +if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=ON if not defined WITH_PYTHON set WITH_PYTHON=ON if not defined ON_INFER set ON_INFER=ON if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON if not defined 
WITH_STATIC_LIB set WITH_STATIC_LIB=ON -if not defined WITH_TPCACHE set WITH_TPCACHE=ON +if not defined WITH_TPCACHE set WITH_TPCACHE=OFF if not defined WITH_CLCACHE set WITH_CLCACHE=OFF if not defined WITH_CACHE set WITH_CACHE=OFF +if not defined WITH_SCCACHE set WITH_SCCACHE=OFF if not defined WITH_UNITY_BUILD set WITH_UNITY_BUILD=OFF if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo if not defined LOG_LEVEL set LOG_LEVEL=normal if not defined PRECISION_TEST set PRECISION_TEST=OFF if not defined NIGHTLY_MODE set PRECISION_TEST=OFF -if not defined retry_times set retry_times=2 +if not defined retry_times set retry_times=3 +if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 + +rem ------initialize the python environment------ +set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe +set PATH=%PYTHON_ROOT%\Scripts;%PYTHON_ROOT%;%PATH% +if "%WITH_PYTHON%" == "ON" ( + where python + where pip + pip install wheel --user + pip install -r %work_dir%\python\requirements.txt --user + if !ERRORLEVEL! NEQ 0 ( + echo pip install requirements.txt failed! + exit /b 7 + ) +) -rem -------set cache build directory----------- +rem -------Caching strategy 1: keep build directory for incremental compilation----------- rmdir build\python /s/q +rmdir build\paddle\third_party\externalError /s/q +rem rmdir build\paddle\fluid\pybind /s/q rmdir build\paddle_install_dir /s/q rmdir build\paddle_inference_install_dir /s/q rmdir build\paddle_inference_c_install_dir /s/q del build\CMakeCache.txt -: set CI_SKIP_CPP_TEST if only *.py changed -git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON - if "%WITH_CACHE%"=="OFF" ( rmdir build /s/q goto :mkbuild @@ -114,11 +127,16 @@ if %ERRORLEVEL% EQU 0 ( git branch last_pr ) -:: git diff HEAD origin/develop --stat --name-only -:: git diff HEAD origin/develop --stat --name-only | findstr ".cmake CMakeLists.txt paddle_build.bat" -:: if %ERRORLEVEL% EQU 0 ( -:: rmdir build /s/q -:: ) +for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# +set day_now=%datetime:~6,2% +set day_before=-1 +set /p day_before=< %cache_dir%\day.txt +if %day_now% NEQ %day_before% ( + echo %day_now% > %cache_dir%\day.txt + type %cache_dir%\day.txt + rmdir build /s/q + goto :mkbuild +) :mkbuild if not exist build ( @@ -134,73 +152,49 @@ cd /d build dir . dir %cache_dir% dir paddle\fluid\pybind\Release - -rem ------initialize the python environment------ -if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 -set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe -set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH% - -rem ToDo: virtual environment can't be deleted safely, some process not exit when task is canceled -rem Now use system python environment temporarily -rem %PYTHON_EXECUTABLE% -m pip install virtualenv -rem %PYTHON_EXECUTABLE% -m virtualenv paddle_winci -rem call paddle_winci\Scripts\activate.bat - -rem ------pre install python requirement---------- -where python -where pip -pip install wheel --user -pip install -r %work_dir%\python\unittest_py\requirements.txt --user -pip install -r %work_dir%\python\requirements.txt --user - -if %ERRORLEVEL% NEQ 0 ( - echo pip install requirements.txt failed! - exit /b 7 -) - -rem ------pre install clcache and init config---------- -rem pip install clcache --user -pip uninstall -y clcache -:: set USE_CLCACHE to enable clcache -rem set USE_CLCACHE=1 -:: In some scenarios, CLCACHE_HARDLINK can save one file copy. 
-rem set CLCACHE_HARDLINK=1 -:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported -rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 -:: set maximum cache size to 20G -rem clcache.exe -M 21474836480 - -:: install ninja if GENERATOR is Ninja -if %GENERATOR% == "Ninja" ( - pip install ninja - if %errorlevel% NEQ 0 ( - echo pip install ninja failed! - exit /b 7 - ) -) - -rem ------show summary of current environment---------- -cmake --version -if "%WITH_GPU%"=="ON" ( - nvcc --version - nvidia-smi +rem -------Caching strategy 1: End -------------------------------- + +rem -------Caching strategy 2: sccache decorate compiler----------- +if "%WITH_SCCACHE%"=="ON" ( + cmd /C sccache -V || call :install_sccache + sccache --stop-server 2> NUL + if not exist D:\sccache mkdir D:\sccache + set SCCACHE_DIR=D:\sccache\.cache + set SCCACHE_CACHE_SIZE=30G + set SCCACHE_ERROR_LOG=D:\sccache\sccache_log.txt + set SCCACHE_LOG=quiet + sccache --start-server + sccache -z + goto :CASE_%1 +) else ( + del %PYTHON_ROOT%\sccache.exe 2> NUL + goto :CASE_%1 ) -::python %work_dir%\tools\summary_env.py -::%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\get_cpu_info.sh -goto :CASE_%1 +:install_sccache +echo There is not sccache in this PC, will install sccache. +echo Download package from https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe +%PYTHON_ROOT%\python.exe -c "import wget;wget.download('https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe')" +xcopy sccache.exe %PYTHON_ROOT%\Scripts\ /Y +goto:eof +rem -------Caching strategy 2: End -------------------------------- echo "Usage: paddle_build.bat [OPTION]" echo "OPTION:" -echo "wincheck_mkl: run Windows MKL/GPU/UnitTest CI tasks on Windows" -echo "wincheck_openbals: run Windows OPENBLAS/CPU CI tasks on Windows" +echo "wincheck_mkl: run Windows MKL/GPU PR CI tasks on Windows" +echo "wincheck_openbals: run Windows OPENBLAS/CPU PR CI tasks on Windows" +echo "build_avx_whl: build Windows avx whl package on Windows" +echo "build_no_avx_whl: build Windows no avx whl package on Windows" +echo "build_inference_lib: build Windows inference library on Windows" exit /b 1 rem ------PR CI windows check for MKL/GPU---------- :CASE_wincheck_mkl set WITH_MKL=ON set WITH_GPU=ON +set WITH_AVX=ON set MSVC_STATIC_CRT=OFF +set ON_INFER=ON call :cmake || goto cmake_error call :build || goto build_error @@ -212,16 +206,18 @@ goto:success rem ------PR CI windows check for OPENBLAS/CPU------ :CASE_wincheck_openblas -set WITH_MKL=ON +set WITH_MKL=OFF set WITH_GPU=OFF +set WITH_AVX=OFF set MSVC_STATIC_CRT=ON set retry_times=1 +set ON_INFER=OFF call :cmake || goto cmake_error call :build || goto build_error call :test_whl_pacakage || goto test_whl_pacakage_error call :test_unit || goto test_unit_error -call :test_inference || goto test_inference_error +:: call :test_inference || goto test_inference_error :: call :check_change_of_unittest || goto check_change_of_unittest_error goto:success @@ -251,20 +247,27 @@ goto:success rem ------Build windows inference library------ :CASE_build_inference_lib +set ON_INFER=ON set WITH_PYTHON=OFF set CUDA_ARCH_NAME=All +python %work_dir%\tools\remove_grad_op_and_kernel.py +if %errorlevel% NEQ 0 exit /b 1 call :cmake || goto cmake_error call :build || goto build_error -call :zip_file || goto zip_file_error +call :test_inference || goto test_inference_error +call :zip_cc_file || goto zip_cc_file_error +call :zip_c_file || goto zip_c_file_error goto:success rem "Other configurations are added 
here" rem :CASE_wincheck_others rem call ... + rem --------------------------------------------------------------------------------------------- :cmake +@ECHO OFF echo ======================================== echo Step 1. Cmake ... echo ======================================== @@ -274,16 +277,44 @@ call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary set DISTUTILS_USE_SDK=1 rem Windows 10 Kit bin dir set PATH=C:\Program Files (x86)\Windows Kits\10\bin\10.0.17763.0\x64;%PATH% +rem Use 64-bit ToolSet to compile +set PreferredToolArchitecture=x64 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% -@ECHO ON -if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0 +if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 set PATH=%TENSORRT_ROOT:/=\%\lib;%CUDA_TOOLKIT_ROOT_DIR%\bin;%CUDA_TOOLKIT_ROOT_DIR%\libnvvp;%PATH% -rem ------set third_party cache dir------ +rem install ninja if GENERATOR is Ninja +if %GENERATOR% == "Ninja" ( + pip install ninja + if %errorlevel% NEQ 0 ( + echo pip install ninja failed! + exit /b 7 + ) +) + +rem ------show summary of current GPU environment---------- +cmake --version +if "%WITH_GPU%"=="ON" ( + nvcc --version + nvidia-smi 2>NUL +) + +rem ------pre install clcache and init config---------- +rem pip install clcache --user +pip uninstall -y clcache +:: set USE_CLCACHE to enable clcache +rem set USE_CLCACHE=1 +:: In some scenarios, CLCACHE_HARDLINK can save one file copy. +rem set CLCACHE_HARDLINK=1 +:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported +rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 +:: set maximum cache size to 20G +rem clcache.exe -M 21474836480 +rem ------set third_party cache dir------ : clear third party cache every once in a while for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# set day_now=%datetime:~6,2% @@ -320,25 +351,26 @@ echo echo ${md5_content}^>md5.txt >> cache.sh set /p md5=< md5.txt if "%WITH_GPU%"=="ON" ( - set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party_GPU/%md5% + set THIRD_PARTY_HOME=%cache_dir:\=/%/third_party_GPU ) else ( - set THIRD_PARTY_PATH=%cache_dir:\=/%/third_party/%md5% + set THIRD_PARTY_HOME=%cache_dir:\=/%/third_party ) +set THIRD_PARTY_PATH=%THIRD_PARTY_HOME%/%md5% :cmake_impl -echo cmake .. -G %GENERATOR% -T host=x64 -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ -DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ --DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% +-DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUB_PATH=%THIRD_PARTY_HOME%/cub -cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -T host=x64 -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +cmake .. 
-G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ -DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ --DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% +-DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -DCUB_PATH=%THIRD_PARTY_HOME%/cub goto:eof :cmake_error @@ -356,18 +388,26 @@ echo ======================================== for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*4/5 echo "PARALLEL PROJECT COUNT is %PARALLEL_PROJECT_COUNT%" + set build_times=1 +rem MSbuild will build third_party first to improve compiler stability. +if NOT %GENERATOR% == "Ninja" ( + goto :build_tp +) else ( + goto :build_paddle +) + :build_tp echo Build third_party the %build_times% time: - if %GENERATOR% == "Ninja" ( ninja third_party ) else ( - MSBuild /m /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:quiet third_party.vcxproj + MSBuild /m /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:%LOG_LEVEL% third_party.vcxproj ) + if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 - if %build_times% GTR %retry_times% ( + if %build_times% GEQ %retry_times% ( exit /b 7 ) else ( echo Build third_party failed, will retry! @@ -382,30 +422,34 @@ set build_times=1 rem clcache.exe -z rem -------clean up environment again----------- -taskkill /f /im MSBuild.exe 2>NUL -taskkill /f /im cl.exe 2>NUL -taskkill /f /im lib.exe 2>NUL -taskkill /f /im link.exe 2>NUL -taskkill /f /im vctip.exe 2>NUL -taskkill /f /im cvtres.exe 2>NUL -taskkill /f /im rc.exe 2>NUL -taskkill /f /im mspdbsrv.exe 2>NUL -taskkill /f /im csc.exe 2>NUL -taskkill /f /im nvcc.exe 2>NUL -taskkill /f /im cicc.exe 2>NUL -taskkill /f /im ptxas.exe 2>NUL -taskkill /f /im test_api_impl.exe 2>NUL -taskkill /f /im op_function_generator.exe 2>NUL +taskkill /f /im cmake.exe /t 2>NUL +taskkill /f /im MSBuild.exe /t 2>NUL +taskkill /f /im cl.exe /t 2>NUL +taskkill /f /im lib.exe /t 2>NUL +taskkill /f /im link.exe /t 2>NUL +taskkill /f /im vctip.exe /t 2>NUL +taskkill /f /im cvtres.exe /t 2>NUL +taskkill /f /im rc.exe /t 2>NUL +taskkill /f /im mspdbsrv.exe /t 2>NUL +taskkill /f /im csc.exe /t 2>NUL +taskkill /f /im nvcc.exe /t 2>NUL +taskkill /f /im cicc.exe /t 2>NUL +taskkill /f /im ptxas.exe /t 2>NUL +taskkill /f /im op_function_generator.exe /t 2>NUL +wmic process where name="cmake.exe" call terminate 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL -wmic process where name="test_api_impl.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL -wmic process where name="CL.exe" call terminate 2>NUL -wmic process where name="Lib.exe" call terminate 2>NUL +wmic process where name="cl.exe" call terminate 2>NUL +wmic process where name="lib.exe" call terminate 2>NUL + +if "%WITH_TESTING%"=="ON" ( + for /F "tokens=1 delims= " %%# in ('tasklist ^| findstr /i test') do taskkill /f /im %%# /t +) echo Build Paddle the %build_times% time: if %GENERATOR% == "Ninja" ( - ninja -j %PARALLEL_PROJECT_COUNT% + 
ninja all ) else ( if "%WITH_CLCACHE%"=="OFF" ( MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj @@ -416,7 +460,7 @@ if %GENERATOR% == "Ninja" ( if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 - if %build_times% GTR %retry_times% ( + if %build_times% GEQ %retry_times% ( exit /b 7 ) else ( echo Build Paddle failed, will retry! @@ -428,8 +472,10 @@ echo Build Paddle successfully! echo 0 > %cache_dir%\error_code.txt type %cache_dir%\error_code.txt -:: ci will collect clcache hit rate -rem goto :collect_clcache_hits +:: ci will collect sccache hit rate +if "%WITH_SCCACHE%"=="ON" ( + call :collect_sccache_hits +) goto:eof @@ -497,6 +543,15 @@ echo ======================================== echo Step 4. Running unit tests ... echo ======================================== +: set CI_SKIP_CPP_TEST if only *.py changed +git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON + +pip install -r %work_dir%\python\unittest_py\requirements.txt --user +if %ERRORLEVEL% NEQ 0 ( + echo pip install unittest requirements.txt failed! + exit /b 7 +) + for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% @@ -627,12 +682,12 @@ echo git fetch upstream $BRANCH # develop is not fetched>> check_change_of_ echo fi>> check_change_of_unittest.sh echo git checkout -b origin_pr >> check_change_of_unittest.sh echo git checkout -f $BRANCH >> check_change_of_unittest.sh -echo cmake .. -G %GENERATOR% -T host=x64 -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ --DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ +echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ --DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT=%TENSORRT_ROOT% -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ --DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% >> check_change_of_unittest.sh +-DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ +-DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% >> check_change_of_unittest.sh echo cat ^<^> check_change_of_unittest.sh echo ============================================ >> check_change_of_unittest.sh echo Generate unit tests.spec of develop. >> check_change_of_unittest.sh @@ -670,7 +725,8 @@ goto:eof exit /b 1 rem --------------------------------------------------------------------------------------------- -:zip_file +:zip_cc_file +cd /d %work_dir%\build tree /F %cd%\paddle_inference_install_dir\paddle if exist paddle_inference.zip del paddle_inference.zip python -c "import shutil;shutil.make_archive('paddle_inference', 'zip', root_dir='paddle_inference_install_dir')" @@ -682,10 +738,28 @@ for /F %%i in ("%libsize%") do ( ) goto:eof -:zip_file_error +:zip_cc_file_error echo Tar inference library failed! 
exit /b 1 +rem --------------------------------------------------------------------------------------------- +:zip_c_file +cd /d %work_dir%\build +tree /F %cd%\paddle_inference_c_install_dir\paddle +if exist paddle_inference_c.zip del paddle_inference_c.zip +python -c "import shutil;shutil.make_archive('paddle_inference_c', 'zip', root_dir='paddle_inference_c_install_dir')" +%cache_dir%\tools\busybox64.exe du -h -k paddle_inference_c.zip > lib_size.txt +set /p libsize=< lib_size.txt +for /F %%i in ("%libsize%") do ( + set /a libsize_m=%%i/1024 + echo "Windows Paddle_Inference CAPI ZIP Size: !libsize_m!M" +) +goto:eof + +:zip_c_file_error +echo Tar inference capi library failed! +exit /b 1 + :timestamp setlocal enabledelayedexpansion @ECHO OFF @@ -725,16 +799,22 @@ echo ipipe_log_param_Windows_%tempTaskName: =_%_Time: %cost_secs%s goto:eof -:collect_clcache_hits -for /f "tokens=2,4" %%i in ('clcache.exe -s ^| findstr "entries hits"') do set %%i=%%j -if %hits% EQU 0 ( - echo "clcache hit rate: 0%%" - echo ipipe_log_param_Clcache_Hit_Rate: 0%% +:collect_sccache_hits +sccache -s > sccache_summary.txt +echo ======================================== +echo sccache statistical summary ... +echo ======================================== +type sccache_summary.txt +for /f "tokens=2,3" %%i in ('type sccache_summary.txt ^| findstr "requests hits" ^| findstr /V "executed C/C++ CUDA"') do set %%i=%%j +if %requests% EQU 0 ( + echo "sccache hit rate: 0%" + echo ipipe_log_param_sccache_Hit_Hate: 0% ) else ( - set /a rate=%hits%*10000/%entries% - echo "clcache hit rate: %rate:~0,-2%.%rate:~-2%%%" - echo ipipe_log_param_Clcache_Hit_Hate: %rate:~0,-2%.%rate:~-2%%% + set /a rate=!hits!*10000/!requests! + echo "sccache hit rate: !rate:~0,-2!.!rate:~-2!%%" + echo ipipe_log_param_sccache_Hit_Hate: !rate:~0,-2!.!rate:~-2!%% ) + goto:eof @@ -743,31 +823,33 @@ rem ---------------------------------------------------------------------------- echo ======================================== echo Clean up environment at the end ... 
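The `collect_sccache_hits` block above prints the hit rate with two implied decimal places using integer arithmetic. A small Python rendering of that calculation; the zero-padding guard for rates below 1% is an addition here, while the batch code simply prints the raw substrings:

```python
# rate = hits * 10000 // requests, then the last two digits are the decimals,
# mirroring: set /a rate=!hits!*10000/!requests!
def sccache_hit_rate(hits: int, requests: int) -> str:
    if requests == 0:
        return "0%"
    rate = hits * 10000 // requests     # e.g. 8438 -> "84.38%"
    digits = str(rate).rjust(3, "0")    # keep at least one integer digit
    return "{}.{}%".format(digits[:-2], digits[-2:])

print(sccache_hit_rate(0, 0))       # 0%
print(sccache_hit_rate(843, 999))   # 84.38%
```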
echo ======================================== -taskkill /f /im cmake.exe 2>NUL -taskkill /f /im MSBuild.exe 2>NUL -taskkill /f /im git.exe 2>NUL -taskkill /f /im cl.exe 2>NUL -taskkill /f /im lib.exe 2>NUL -taskkill /f /im link.exe 2>NUL -taskkill /f /im git-remote-https.exe 2>NUL -taskkill /f /im vctip.exe 2>NUL -taskkill /f /im cvtres.exe 2>NUL -taskkill /f /im rc.exe 2>NUL -taskkill /f /im mspdbsrv.exe 2>NUL -taskkill /f /im csc.exe 2>NUL -taskkill /f /im python.exe 2>NUL -taskkill /f /im nvcc.exe 2>NUL -taskkill /f /im cicc.exe 2>NUL -taskkill /f /im ptxas.exe 2>NUL -taskkill /f /im test_api_impl.exe 2>NUL -taskkill /f /im op_function_generator.exe 2>NUL +taskkill /f /im cmake.exe /t 2>NUL +taskkill /f /im ninja.exe /t 2>NUL +taskkill /f /im MSBuild.exe /t 2>NUL +taskkill /f /im git.exe /t 2>NUL +taskkill /f /im cl.exe /t 2>NUL +taskkill /f /im lib.exe /t 2>NUL +taskkill /f /im link.exe /t 2>NUL +taskkill /f /im git-remote-https.exe /t 2>NUL +taskkill /f /im vctip.exe /t 2>NUL +taskkill /f /im cvtres.exe /t 2>NUL +taskkill /f /im rc.exe /t 2>NUL +taskkill /f /im mspdbsrv.exe /t 2>NUL +taskkill /f /im csc.exe /t 2>NUL +taskkill /f /im python.exe /t 2>NUL +taskkill /f /im nvcc.exe /t 2>NUL +taskkill /f /im cicc.exe /t 2>NUL +taskkill /f /im ptxas.exe /t 2>NUL +taskkill /f /im op_function_generator.exe /t 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL -wmic process where name="test_api_impl.exe" call terminate 2>NUL wmic process where name="cvtres.exe" call terminate 2>NUL wmic process where name="rc.exe" call terminate 2>NUL -wmic process where name="CL.exe" call terminate 2>NUL -wmic process where name="Lib.exe" call terminate 2>NUL +wmic process where name="cl.exe" call terminate 2>NUL +wmic process where name="lib.exe" call terminate 2>NUL wmic process where name="python.exe" call terminate 2>NUL +if "%WITH_TESTING%"=="ON" ( + for /F "tokens=1 delims= " %%# in ('tasklist ^| findstr /i test') do taskkill /f /im %%# /t +) echo Windows CI run successfully! exit /b 0 diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index b8b9f40aa33fc26f1a75523bde2c079a6b4362ee..5c2309164dd026d753e40e5ddf351842f4f48249 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -79,37 +79,12 @@ function cmake_base() { # Delete previous built whl packages rm -rf python/dist 2>/dev/null || true - # Support build for all python versions, currently - # including cp27-cp27m and cp27-cp27mu. 
+ # Support build for all python3 versions PYTHON_FLAGS="" SYSTEM=`uname -s` if [ "$SYSTEM" == "Darwin" ]; then echo "Using python abi: $1" - if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then - if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then - export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 - export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 - export PATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/:${PATH} - PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7 - -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7 - -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/lib/libpython2.7.dylib" - pip install --user -r ${PADDLE_ROOT}/python/requirements.txt - else - exit 1 - fi - elif [ "$1" == "cp35-cp35m" ]; then - if [ -d "/Library/Frameworks/Python.framework/Versions/3.5" ]; then - export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/ - export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/ - export PATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/:${PATH} - PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3 - -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/ - -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib" - pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt - else - exit 1 - fi - elif [ "$1" == "cp36-cp36m" ]; then + if [ "$1" == "cp36-cp36m" ] || [ "$1" == "" ]; then if [ -d "/Library/Frameworks/Python.framework/Versions/3.6" ]; then export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/ export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/ @@ -161,42 +136,7 @@ function cmake_base() { else if [ "$1" != "" ]; then echo "using python abi: $1" - if [ "$1" == "cp27-cp27m" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.15-ucs4/lib:} - export PATH=/opt/python/cp27-cp27m/bin/:${PATH} - PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python - -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7 - -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.15-ucs2/lib/libpython2.7.so" - pip install -r ${PADDLE_ROOT}/python/requirements.txt - elif [ "$1" == "cp27-cp27mu" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.15-ucs2/lib:} - export PATH=/opt/python/cp27-cp27mu/bin/:${PATH} - PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python - -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 - -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.15-ucs4/lib/libpython2.7.so" - pip install -r ${PADDLE_ROOT}/python/requirements.txt - elif [ "$1" == "cp27-cp27m-gcc82" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.15-ucs4/lib:} - export PATH=/opt/python/cp27-cp27m/bin/:${PATH} - PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python - -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7 - -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.15-ucs2/lib/libpython2.7.so" - pip install -r 
${PADDLE_ROOT}/python/requirements.txt - elif [ "$1" == "cp27-cp27mu-gcc82" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.15-ucs2/lib:} - export PATH=/opt/python/cp27-cp27mu/bin/:${PATH} - PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python - -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 - -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.15-ucs4/lib/libpython2.7.so" - pip install -r ${PADDLE_ROOT}/python/requirements.txt - elif [ "$1" == "cp35-cp35m" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} - export PATH=/opt/_internal/cpython-3.5.1/bin/:${PATH} - export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3 - -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m - -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so" - pip3.5 install -r ${PADDLE_ROOT}/python/requirements.txt - elif [ "$1" == "cp36-cp36m" ]; then + if [ "$1" == "cp36-cp36m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH} export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3 @@ -248,6 +188,12 @@ function cmake_base() { distibuted_flag=${WITH_DISTRIBUTE:-OFF} gloo_flag=${distibuted_flag} + if [ "$CMD" != "assert_file_approvals" ];then + python -m pip install distro + python ${PADDLE_ROOT}/tools/summary_env.py + bash ${PADDLE_ROOT}/tools/get_cpu_info.sh + fi + cat <> ${PADDLE_ROOT}/build/build_summary.txt + echo "ipipe_log_param_Paddle_Inference_So_Size: $soLibSize" >> ${PADDLE_ROOT}/build/build_summary.txt + elif [ "$1" == "paddle_inference_c" ]; then + cd ${PADDLE_ROOT}/build + cp -r paddle_inference_c_install_dir paddle_inference_c + tar -czf paddle_inference_c.tgz paddle_inference_c + buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference_c.tgz |awk '{print $1}') + echo "Paddle_Inference Capi Size: $buildSize" + echo "ipipe_log_param_Paddle_Inference_capi_Size: $buildSize" >> ${PADDLE_ROOT}/build/build_summary.txt else SYSTEM=`uname -s` if [ "$SYSTEM" == "Darwin" ]; then @@ -591,11 +547,7 @@ EOF set -x set +ex - if [ "$1" == "cp27-cp27m" ]; then - pip uninstall -y paddlepaddle - elif [ "$1" == "cp35-cp35m" ]; then - pip3.5 uninstall -y paddlepaddle - elif [ "$1" == "cp36-cp36m" ]; then + if [ "$1" == "cp36-cp36m" ]; then pip3.6 uninstall -y paddlepaddle elif [ "$1" == "cp37-cp37m" ]; then pip3.7 uninstall -y paddlepaddle @@ -606,13 +558,7 @@ EOF fi set -ex - if [ "$1" == "cp27-cp27m" ]; then - set -e - pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl - python ${PADDLE_ROOT}/paddle/scripts/installation_validate.py - elif [ "$1" == "cp35-cp35m" ]; then - pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl - elif [ "$1" == "cp36-cp36m" ]; then + if [ "$1" == "cp36-cp36m" ]; then pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl elif [ "$1" == "cp37-cp37m" ]; then pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl @@ -662,8 +608,10 @@ EOF if [ $need_retry_ut_count -lt $exec_retry_threshold ];then while ( [ $exec_times -lt $retry_time ] ) do + set +e retry_unittests_record="$retry_unittests_record$failed_test_lists" failed_test_lists_ult=`echo "${failed_test_lists}"` + set -e if [[ "${exec_times}" == "1" ]];then if [[ 
"${failed_test_lists}" == "" ]];then break @@ -818,11 +766,6 @@ function generate_api_spec() { awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api - if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" || [ "$1" == "cp39-cp39" ]; then - # Use sed to make python2 and python3 sepc keeps the same - sed -i 's/arg0: str/arg0: unicode/g' $spec_path - sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path - fi python ${PADDLE_ROOT}/tools/diff_use_default_grad_op_maker.py \ ${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_${spec_kind}.spec @@ -1228,21 +1171,21 @@ set +x fi if [[ "$is_exclusive" != "" ]]; then - if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then exclusive_tests_two_parallel="$exclusive_tests_two_parallel|^$testcase$" else exclusive_tests_non_parallel="$exclusive_tests_non_parallel|^$testcase$" fi elif [[ "$is_multicard" != "" ]]; then - if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then multiple_card_tests_two_parallel="$multiple_card_tests_two_parallel|^$testcase$" else multiple_card_tests_non_parallel="$multiple_card_tests_non_parallel|^$testcase$" fi else - if [[ $(echo $cpu_parallel_job | grep -o $testcase) != "" ]]; then + if [[ $(echo $cpu_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then single_card_tests_high_parallel="$single_card_tests_high_parallel|^$testcase$" - elif [[ $(echo $tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + elif [[ $(echo $tetrad_parallel_job$two_parallel_job | grep -o "\^$testcase\\$") != "" ]]; then single_card_tests_two_parallel="$single_card_tests_two_parallel|^$testcase$" else single_card_tests_non_parallel="$single_card_tests_non_parallel|^$testcase$" @@ -1256,11 +1199,13 @@ set +x testcase='' done <<< "$test_cases"; - card_test "$single_card_tests_high_parallel" 1 8 # run cases the most each time with single GPU + card_test "$single_card_tests_high_parallel" 1 6 # run cases the most each time with single GPU card_test "$single_card_tests_two_parallel" 1 2 # run cases 2 job each time with single GPU card_test "$single_card_tests_non_parallel" 1 # run cases 1 job each time with single GPU + card_test "$multiple_card_tests_two_parallel" 2 2 # run cases 2 job each time with two GPUs card_test "$multiple_card_tests_non_parallel" 2 # run cases 1 job each time with two GPUs + card_test "$exclusive_tests_two_parallel" -1 2 # run cases exclusively, in this cases would be run with 2/4/8 GPUs card_test "$exclusive_tests_non_parallel" -1 # run cases exclusively, in this cases would be run with 2/4/8 GPUs collect_failed_tests @@ -1282,8 +1227,10 @@ set +x if [ $need_retry_ut_count -lt $exec_retry_threshold ];then while ( [ $exec_times -lt $retry_time ] ) do + set +e retry_unittests_record="$retry_unittests_record$failed_test_lists" failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'` + set -e if [[ "${exec_times}" == "1" ]];then if [[ "${failed_test_lists}" == "" ]];then break @@ -1409,6 +1356,186 @@ EOF fi } +function insert_pile_to_h_cu_diff { + # TODO get develop h/cu md5 + cd ${PADDLE_ROOT} + find ${PADDLE_ROOT} -name 
'*.cu'| grep -v ${PADDLE_ROOT}/build >> ${PADDLE_ROOT}/tools/h_cu_files.log + python ${PADDLE_ROOT}/tools/handle_h_cu_file.py 'get_h_file_md5' ${PADDLE_ROOT} + + # TODO insert pile to diff h/cu file + + #insert pile to full h/cu file + python ${PADDLE_ROOT}/tools/handle_h_cu_file.py 'insert_pile_to_h_file' ${PADDLE_ROOT} +} + +function precise_card_test_single { + set +e + set +x + testcases=$1 + num=$2 + for case in $(echo $testcases | tr "$|^" "\n") + do + cd ${PADDLE_ROOT}/build + precise_card_test "^${case}$" $num + # c++ + if [ ! -d "${PADDLE_ROOT}/build/ut_map/$case" ];then + mkdir ${PADDLE_ROOT}/build/ut_map/$case + fi + set -x + find paddle/fluid -name '*.gcda'|xargs -I {} cp --path {} ut_map/$case + find paddle/fluid -name '*.gcno'|xargs -I {} cp --path {} ut_map/$case + python ${PADDLE_ROOT}/tools/get_single_test_cov.py ${PADDLE_ROOT} $case & + + # python + ls python-coverage.data.* + if [[ $? == 0 ]] + then + if [ ! -d "${PADDLE_ROOT}/build/pytest/$case" ];then + mkdir -p ${PADDLE_ROOT}/build/pytest/$case + fi + mv python-coverage.data.* ${PADDLE_ROOT}/build/pytest/$case + fi + find paddle/fluid -name *.gcda | xargs rm -f #delete gcda + done +} + +function precise_card_test() { + set -m + testcases=$1 + if (( $# > 1 )); then + cardnumber=$2 + cuda_list="0" + if [ $cardnumber -eq 2 ]; then + cuda_list=${CUDA_VISIBLE_DEVICES} + else + cuda_list="0" + fi + else + cardnumber=2 + cuda_list=${CUDA_VISIBLE_DEVICES} + fi + + if [[ "$testcases" == "" ]]; then + return 0 + fi + + echo "****************************************************************" + echo "***Running ut: $testcases***" + echo "****************************************************************" + + tmpfile=$tmp_dir/$testcases".log" + env CUDA_VISIBLE_DEVICES=$cuda_list ctest -I 0,,1 -R "($testcases)" --timeout 500 --output-on-failure -V -j 1 > $tmpfile + set +m +} + +function get_precise_tests_map_file { + cd ${PADDLE_ROOT}/build + pip install ${PADDLE_ROOT}/build/python/dist/*whl + ut_total_startTime_s=`date +%s` + EXIT_CODE=0; + test_cases=$(ctest -N -V) # get all test cases + single_card_tests='' # all cases list which would take one graph card + exclusive_tests='' # cases list which would be run exclusively + multiple_card_tests='' # cases list which would take multiple GPUs, most cases would be two GPUs + is_exclusive='' # indicate whether the case is exclusive type + is_multicard='' # indicate whether the case is multiple GPUs type +set +x + + while read -r line; do + if [[ "$line" == "" ]]; then + continue + fi + read matchstr <<< $(echo "$line"|grep -oEi 'Test[ \t]+#') + if [[ "$matchstr" == "" ]]; then + # Any test case with LABELS property would be parse here + # RUN_TYPE=EXCLUSIVE mean the case would run exclusively + # RUN_TYPE=DIST mean the case would take two graph GPUs during runtime + read is_exclusive <<< $(echo "$line"|grep -oEi "RUN_TYPE=EXCLUSIVE") + read is_multicard <<< $(echo "$line"|grep -oEi "RUN_TYPE=DIST") + continue + fi + read testcase <<< $(echo "$line"|grep -oEi "\w+$") + + if [[ "$is_multicard" == "" ]]; then + # trick: treat all test case with prefix "test_dist" as dist case, and would run on 2 GPUs + read is_multicard <<< $(echo "$testcase"|grep -oEi "test_dist_") + fi + + if [[ "$is_exclusive" != "" ]]; then + if [[ "$exclusive_tests" == "" ]]; then + exclusive_tests="^$testcase$" + else + exclusive_tests="$exclusive_tests|^$testcase$" + fi + elif [[ "$is_multicard" != "" ]]; then + if [[ "$multiple_card_tests" == "" ]]; then + multiple_card_tests="^$testcase$" + else + 
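`precise_card_test` above is essentially a thin ctest wrapper: it pins the run to a GPU set through `CUDA_VISIBLE_DEVICES`, selects cases with `-R`, and redirects the verbose log to a per-case file under `$tmp_dir`. A stripped-down sketch of that wrapper, with hypothetical defaults in place of the script's card-number bookkeeping:

```bash
#!/usr/bin/env bash
# Run the ctest cases matching a regex on a chosen GPU set and keep the log.
# Assumes it is called from the CMake build directory of an already-built tree.
run_pinned_ctest() {
    local pattern=$1
    local gpus=${2:-0}                       # default: first GPU only
    local logfile=${3:-/tmp/precise_ut.log}
    env CUDA_VISIBLE_DEVICES="${gpus}" \
        ctest -R "(${pattern})" --timeout 500 --output-on-failure -V -j 1 \
        > "${logfile}" 2>&1
}

# Example: a single case on GPU 0, then a DIST-style case on two GPUs.
run_pinned_ctest '^test_mul_op$' 0 /tmp/test_mul_op.log
run_pinned_ctest '^test_dist_mnist$' 0,1 /tmp/test_dist_mnist.log
```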
multiple_card_tests="$multiple_card_tests|^$testcase$" + fi + else + if [[ "${single_card_tests}" -gt 3000 ]];then + if [[ "$single_card_tests_1" == "" ]]; then + single_card_tests_1="^$testcase$" + else + single_card_tests_1="$single_card_tests_1|^$testcase$" + fi + continue + fi + if [[ "$single_card_tests" == "" ]]; then + single_card_tests="^$testcase$" + else + single_card_tests="$single_card_tests|^$testcase$" + fi + fi + is_exclusive='' + is_multicard='' + is_nightly='' + matchstr='' + testcase='' + done <<< "$test_cases"; + +set -x + mkdir -p ${PADDLE_ROOT}/build/ut_map + mkdir -p ${PADDLE_ROOT}/build/pytest + + precise_card_test_single "$single_card_tests" 1 + precise_card_test_single "$single_card_tests_1" 1 + precise_card_test_single "$multiple_card_tests" 2 + precise_card_test_single "$exclusive_tests" + wait; + python ${PADDLE_ROOT}/tools/get_ut_file_map.py 'get_not_success_ut' ${PADDLE_ROOT} + + #analy h/cu to Map file + python ${PADDLE_ROOT}/tools/handle_h_cu_file.py 'analy_h_cu_file' $tmp_dir ${PADDLE_ROOT} + + wait; + get_failedUts_precise_map_file + + #generate python coverage and generate python file to tests_map_file + python ${PADDLE_ROOT}/tools/pyCov_multithreading.py ${PADDLE_ROOT} + wait; + + #generate ut map + python ${PADDLE_ROOT}/tools/get_ut_file_map.py 'get_ut_map' ${PADDLE_ROOT} +} + +function get_failedUts_precise_map_file { + if [[ -f "${PADDLE_ROOT}/build/utNotSuccess" ]]; then + rerun_tests=`cat ${PADDLE_ROOT}/build/utNotSuccess` + #remove pile to full h/cu file + python ${PADDLE_ROOT}/tools/handle_h_cu_file.py 'remove_pile_from_h_file' ${PADDLE_ROOT} + cd ${PADDLE_ROOT}/build + cmake_base ${PYTHON_ABI:-""} + build ${parallel_number} + pip uninstall -y paddlepaddle-gpu + pip install ${PADDLE_ROOT}/build/python/dist/*whl + precise_card_test_single "$rerun_tests" + wait; + + fi +} + function parallel_test_base_xpu() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build @@ -1446,10 +1573,11 @@ set -x } function parallel_test() { - ut_total_startTime_s=`date +%s` mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build pip install ${PADDLE_ROOT}/build/python/dist/*whl + cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python + ut_total_startTime_s=`date +%s` if [ "$WITH_GPU" == "ON" ] || [ "$WITH_ROCM" == "ON" ];then parallel_test_base_gpu else @@ -1550,70 +1678,38 @@ EOF ref_web=https://paddle-wheel.bj.bcebos.com/${PADDLE_BRANCH}-${ref_gpu}-${ref_mkl} - ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl - ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl ref_paddle39=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp39-cp39-linux_x86_64.whl - ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl - ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl ref_paddle39_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp39-cp39-linux_x86_64.whl if [[ ${PADDLE_BRANCH} != "0.0.0" && 
${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then - ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl - ref_paddle35=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl ref_paddle39=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp39-cp39-linux_x86_64.whl - ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl - ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl ref_paddle39_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp39-cp39-linux_x86_64.whl fi - #ref_paddle2_mv1="" - #ref_paddle2_mv2="" - ref_paddle35_mv1="" - ref_paddle35_mv2="" ref_paddle36_mv1="" ref_paddle36_mv2="" - #ref_paddle37_mv1="" - #ref_paddle37_mv2="" if [[ ${PADDLE_BRANCH} == "0.0.0" && ${WITH_GPU} == "ON" ]]; then - #ref_paddle2_whl=paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl - ref_paddle35_whl=paddlepaddle_gpu-1.5.1-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle_gpu-1.5.1-cp36-cp36m-linux_x86_64.whl - #ref_paddle37_whl=paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl - #ref_paddle2_mv1="mv ref_paddle2 paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl &&" - #ref_paddle2_mv2="&& mv paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl ref_paddle2" - ref_paddle35_mv1="mv ${ref_paddle35} ${ref_paddle35_whl} &&" - ref_paddle35_mv2="&& mv ${ref_paddle35_whl} ${ref_paddle35}" ref_paddle36_mv1="mv ${ref_paddle36} ${ref_paddle36_whl} &&" ref_paddle36_mv2="&& mv ${ref_paddle36_whl} ${ref_paddle36}" - #ref_paddle37_mv1="mv ref_paddle37 paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl &&" - #ref_paddle37_mv2="&& mv paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl ref_paddle37" fi if [[ ${PADDLE_BRANCH} == "0.0.0" && ${WITH_GPU} != "ON" ]]; then - #ref_paddle2_whl=paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl - ref_paddle35_whl=paddlepaddle-1.5.1-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle-1.5.1-cp36-cp36m-linux_x86_64.whl - #ref_paddle37_whl=paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl - #ref_paddle2_mv1="mv ref_paddle2 paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl &&" - #ref_paddle2_mv2="&& mv paddlepaddle_gpu-1.5.1-cp27-cp27mu-linux_x86_64.whl ref_paddle2" - ref_paddle35_mv1="mv ${ref_paddle35} ${ref_paddle35_whl} &&" - ref_paddle35_mv2="&& mv ${ref_paddle35_whl} ${ref_paddle35}" ref_paddle36_mv1="mv ${ref_paddle36} ${ref_paddle36_whl} &&" ref_paddle36_mv2="&& mv ${ref_paddle36_whl} ${ref_paddle36}" - #ref_paddle37_mv1="mv ref_paddle37 paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl &&" - 
#ref_paddle37_mv2="&& mv paddlepaddle_gpu-1.5.1-cp37-cp37m-linux_x86_64.whl ref_paddle37" fi cat > ${PADDLE_ROOT}/build/Dockerfile <> ${PADDLE_ROOT}/build/Dockerfile <> ${PADDLE_ROOT}/build/build_summary.txt build_size "paddle_inference" + build_size "paddle_inference_c" } function tar_fluid_lib() { @@ -1800,6 +1896,26 @@ EOF fi } +function test_go_inference_api() { + cat <&2 exit 5 @@ -1897,14 +2017,55 @@ function summary_check_problems() { set -x } + +function reuse_so_cache() { + get_html="https://api.github.com/repos/PaddlePaddle/Paddle" + curl -X GET ${get_html}/commits -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt + merge_commit=`grep "sha" tmp.txt| awk -F \" 'NR==1{print $(NF-1)}'| sed 's# ##g'` + curl -X GET ${get_html}/commits/${merge_commit} -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt + merge_pr=`grep -oP -m 1 '(#[0-9]*)' tmp.txt| sed 's/#//g'` + curl -X GET ${get_html}/pulls/${merge_pr}/commits -H "authorization: token ${GITHUB_API_TOKEN}" >tmp.txt + pr_commit=`grep "sha" tmp.txt |tail -3|head -1|awk -F : '{print $NF}'|sed 's#"##g'|sed 's#,##g'| sed 's# ##g'` + set +e + wget -q https://xly-devops.bj.bcebos.com/PR/Paddle/${merge_pr}/${pr_commit}/workspace/Paddle/build/proto_so.tar.gz + down_proto_so=`echo $?` + set -e + if [ "${down_proto_so}" -eq 0 ];then + export CI_SKIP_CPP_TEST=ON + cd build && mv ../proto_so.tar.gz . + tar --use-compress-program=pigz -xpf proto_so.tar.gz + cmake_gen ${PYTHON_ABI:-""} ${parallel_number} + cd python + touch stub.cc + alias cp=cp + cp -r ../../python/paddle . + python setup.py bdist_wheel + else + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + fi +} + +function find_temporary_files() { + set +x + jsonData=`curl \ + -H "Authorization: token ${GITHUB_API_TOKEN}"\ + -H "Accept: application/vnd.github.v3+json" \ + https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/files` + + result=`echo ${jsonData}|python ${PADDLE_ROOT}/tools/check_file_suffix.py` + + if [ ${#result} -gt 0 ] + then + echo ${result} + exit 65 + fi +} + + function main() { local CMD=$1 local parallel_number=$2 init - if [ "$CMD" != "assert_file_approvals" ];then - python ${PADDLE_ROOT}/tools/summary_env.py - bash ${PADDLE_ROOT}/tools/get_cpu_info.sh - fi case $CMD in build_only) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} @@ -1913,14 +2074,21 @@ function main() { set +e check_style_info=$(check_style) check_style_code=$? + find_temporary_files generate_upstream_develop_api_spec ${PYTHON_ABI:-""} ${parallel_number} cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} check_sequence_op_unittest generate_api_spec ${PYTHON_ABI:-""} "PR" set +e - example_info=$(example) + example_info_gpu="" + example_code_gpu=0 + if [ "${WITH_GPU}" == "ON" ] ; then + example_info_gpu=$(exec_samplecode_test gpu) + example_code_gpu=$? + fi + example_info=$(exec_samplecode_test cpu) example_code=$? 
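At the point where this chunk hands off, the sample-code tests have just been run twice, once for GPU (only when `WITH_GPU=ON`) and once for CPU; immediately below, the two exit codes are summed with the legacy `$[ ... ]` arithmetic form so that `summary_check_problems` sees a single nonzero value if either run failed. A small sketch of the same aggregation using the modern `$(( ))` expansion, with a stub standing in for the real `exec_samplecode_test`:

```bash
#!/usr/bin/env bash
# Stub for the real exec_samplecode_test; prints a report and returns nonzero on failure.
exec_samplecode_test() { echo "sample-code tests on $1: ok"; return 0; }

example_info_gpu="" example_code_gpu=0
if [ "${WITH_GPU:-OFF}" == "ON" ]; then
    example_info_gpu=$(exec_samplecode_test gpu)
    example_code_gpu=$?
fi
example_info=$(exec_samplecode_test cpu)
example_code=$?

# Zero only when every executed run passed; $(( )) is the POSIX form of the
# deprecated $[ ] expansion used in the hunk below.
combined=$((example_code_gpu + example_code))
printf '%b\n' "${example_info_gpu}\n${example_info}"
echo "combined sample-code status: ${combined}"
```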
- summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info" + summary_check_problems $check_style_code $[${example_code_gpu} + ${example_code}] "$check_style_info" "${example_info_gpu}\n${example_info}" assert_api_spec_approvals ;; build) @@ -1960,6 +2128,7 @@ function main() { test_fluid_lib ;; build_inference_lib) + python ${PADDLE_ROOT}/tools/remove_grad_op_and_kernel.py cmake_gen ${PYTHON_ABI:-""} gen_fluid_lib ${parallel_number} ;; @@ -1981,6 +2150,24 @@ function main() { check_coverage check_change_of_unittest ${PYTHON_ABI:-""} ;; + cpu_cicheck_coverage) + check_approvals_of_unittest 1 + check_diff_file_for_coverage + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + enable_unused_var_check + ;; + gpu_cicheck_coverage) + check_approvals_of_unittest 1 + parallel_test + check_coverage + check_change_of_unittest ${PYTHON_ABI:-""} + ;; + ci_preciseTest) + insert_pile_to_h_cu_diff + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + enable_unused_var_check + get_precise_tests_map_file + ;; cicheck_brpc) cmake_gen ${PYTHON_ABI:-""} build ${parallel_number} @@ -1996,6 +2183,8 @@ function main() { gen_fluid_lib ${parallel_number} test_fluid_lib #test_fluid_lib_train + #go inference test + test_go_inference_api ;; test_train) gen_fluid_lib ${parallel_number} @@ -2024,6 +2213,12 @@ function main() { cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} parallel_test ;; + cpu_cicheck_py35) + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + ;; + gpu_cicheck_py35) + parallel_test + ;; check_xpu) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} parallel_test @@ -2038,6 +2233,10 @@ function main() { parallel_test check_coverage ;; + reuse_so_cicheck_py35) + reuse_so_cache + parallel_test + ;; cmake_gen) cmake_gen ${PYTHON_ABI:-""} ;; @@ -2056,7 +2255,11 @@ function main() { build_document_preview ;; api_example) - example + example_info=$(exec_samplecode_test cpu) + example_code=$? + check_style_code=0 + check_style_info= + summary_check_problems $check_style_code $example_code "$check_style_info" "$example_info" ;; test_op_benchmark) test_op_benchmark diff --git a/patches/eigen/TensorReductionGpu.h b/patches/eigen/TensorReductionGpu.h new file mode 100644 index 0000000000000000000000000000000000000000..696078e54881afaa69566570d780541b9d383da6 --- /dev/null +++ b/patches/eigen/TensorReductionGpu.h @@ -0,0 +1,996 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// clang-format off +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H + +namespace Eigen { +namespace internal { + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple gpu thread to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. If in the meantime another gpu thread +// updated the content of the output address it will try again. 
+template +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + if (sizeof(T) == 4) + { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else { + gpu_assert(0 && "Wordsize not supported"); + } +#else // EIGEN_CUDA_ARCH >= 300 + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +// We extend atomicExch to support extra data types +template +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} + +#ifdef EIGEN_HAS_GPU_FP16 +template